[PATCH 1/5] target/arm/ptw: Load stage-2 tables from realm physical space
In realm state, stage-2 translation tables are fetched from the realm physical address space (R_PGRQD). Signed-off-by: Jean-Philippe Brucker --- target/arm/ptw.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/target/arm/ptw.c b/target/arm/ptw.c index d1de934702..6318e13b98 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -164,7 +164,11 @@ static ARMMMUIdx ptw_idx_for_stage_2(CPUARMState *env, ARMMMUIdx stage2idx) * an NS stage 1+2 lookup while the NS bit is 0.) */ if (!arm_is_secure_below_el3(env) || !arm_el_is_aa64(env, 3)) { -return ARMMMUIdx_Phys_NS; +if (arm_security_space_below_el3(env) == ARMSS_Realm) { +return ARMMMUIdx_Phys_Realm; +} else { +return ARMMMUIdx_Phys_NS; +} } if (stage2idx == ARMMMUIdx_Stage2_S) { s2walk_secure = !(env->cp15.vstcr_el2 & VSTCR_SW); -- 2.41.0
[PATCH 2/5] target/arm/helper: Fix vae2_tlbmask()
When HCR_EL2.E2H is enabled, TLB entries are formed using the EL2&0 translation regime, instead of the EL2 translation regime. The TLB VAE2* instructions invalidate the regime that corresponds to the current value of HCR_EL2.E2H. At the moment we only invalidate the EL2 translation regime. This causes problems with RMM, which issues TLBI VAE2IS instructions with HCR_EL2.E2H enabled. Update vae2_tlbmask() to take HCR_EL2.E2H into account. Signed-off-by: Jean-Philippe Brucker --- target/arm/helper.c | 26 ++ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/target/arm/helper.c b/target/arm/helper.c index e1b3db6f5f..07a9ac70f5 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -4663,6 +4663,21 @@ static int vae1_tlbmask(CPUARMState *env) return mask; } +static int vae2_tlbmask(CPUARMState *env) +{ +uint64_t hcr = arm_hcr_el2_eff(env); +uint16_t mask; + +if (hcr & HCR_E2H) { +mask = ARMMMUIdxBit_E20_2 | + ARMMMUIdxBit_E20_2_PAN | + ARMMMUIdxBit_E20_0; +} else { +mask = ARMMMUIdxBit_E2; +} +return mask; +} + /* Return 56 if TBI is enabled, 64 otherwise. */ static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx, uint64_t addr) @@ -4781,7 +4796,7 @@ static void tlbi_aa64_vae2_write(CPUARMState *env, const ARMCPRegInfo *ri, * flush-last-level-only. 
*/ CPUState *cs = env_cpu(env); -int mask = e2_tlbmask(env); +int mask = vae2_tlbmask(env); uint64_t pageaddr = sextract64(value << 12, 0, 56); tlb_flush_page_by_mmuidx(cs, pageaddr, mask); @@ -4838,11 +4853,11 @@ static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { CPUState *cs = env_cpu(env); +int mask = vae2_tlbmask(env); uint64_t pageaddr = sextract64(value << 12, 0, 56); int bits = tlbbits_for_regime(env, ARMMMUIdx_E2, pageaddr); -tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, - ARMMMUIdxBit_E2, bits); +tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, bits); } static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -5014,11 +5029,6 @@ static void tlbi_aa64_rvae1is_write(CPUARMState *env, do_rvae_write(env, value, vae1_tlbmask(env), true); } -static int vae2_tlbmask(CPUARMState *env) -{ -return ARMMMUIdxBit_E2; -} - static void tlbi_aa64_rvae2_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) -- 2.41.0
[PATCH 3/5] target/arm: Skip granule protection checks for AT instructions
GPC checks are not performed on the output address for AT instructions, as stated by ARM DDI 0487J in D8.12.2: When populating PAR_EL1 with the result of an address translation instruction, granule protection checks are not performed on the final output address of a successful translation. Rename get_phys_addr_with_secure(), since it's only used to handle AT instructions. Signed-off-by: Jean-Philippe Brucker --- This incidentally fixes a problem with AT S1E1 instructions which can output an IPA and should definitely not cause a GPC. --- target/arm/internals.h | 25 ++--- target/arm/helper.c| 8 ++-- target/arm/ptw.c | 11 ++- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/target/arm/internals.h b/target/arm/internals.h index 0f01bc32a8..fc90c364f7 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult { } GetPhysAddrResult; /** - * get_phys_addr_with_secure: get the physical address for a virtual address + * get_phys_addr: get the physical address for a virtual address * @env: CPUARMState * @address: virtual address to get physical address for * @access_type: 0 for read, 1 for write, 2 for execute * @mmu_idx: MMU index indicating required translation regime - * @is_secure: security state for the access * @result: set on translation success. * @fi: set to fault info if the translation fails * @@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult { * * for PSMAv5 based systems we don't bother to return a full FSR format *value. 
*/ -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address, - MMUAccessType access_type, - ARMMMUIdx mmu_idx, bool is_secure, - GetPhysAddrResult *result, ARMMMUFaultInfo *fi) +bool get_phys_addr(CPUARMState *env, target_ulong address, + MMUAccessType access_type, ARMMMUIdx mmu_idx, + GetPhysAddrResult *result, ARMMMUFaultInfo *fi) __attribute__((nonnull)); /** - * get_phys_addr: get the physical address for a virtual address + * get_phys_addr_with_secure_nogpc: get the physical address for a virtual + * address * @env: CPUARMState * @address: virtual address to get physical address for * @access_type: 0 for read, 1 for write, 2 for execute * @mmu_idx: MMU index indicating required translation regime + * @is_secure: security state for the access * @result: set on translation success. * @fi: set to fault info if the translation fails * - * Similarly, but use the security regime of @mmu_idx. + * Similar to get_phys_addr, but use the given security regime and don't perform + * a Granule Protection Check on the resulting address. */ -bool get_phys_addr(CPUARMState *env, target_ulong address, - MMUAccessType access_type, ARMMMUIdx mmu_idx, - GetPhysAddrResult *result, ARMMMUFaultInfo *fi) +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address, + MMUAccessType access_type, + ARMMMUIdx mmu_idx, bool is_secure, + GetPhysAddrResult *result, + ARMMMUFaultInfo *fi) __attribute__((nonnull)); bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, diff --git a/target/arm/helper.c b/target/arm/helper.c index 07a9ac70f5..3ee2bb5fe1 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -3365,8 +3365,12 @@ static uint64_t do_ats_write(CPUARMState *env, uint64_t value, ARMMMUFaultInfo fi = {}; GetPhysAddrResult res = {}; -ret = get_phys_addr_with_secure(env, value, access_type, mmu_idx, -is_secure, &res, &fi); +/* + * I_MXTJT: Granule protection checks are not performed on the final address + * of a successful translation. 
+ */ +ret = get_phys_addr_with_secure_nogpc(env, value, access_type, mmu_idx, + is_secure, &res, &fi); /* * ATS operations only do S1 or S1+S2 translations, so we never diff --git a/target/arm/ptw.c b/target/arm/ptw.c index 6318e13b98..1aef2b8cef 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -3412,16 +3412,17 @@ static bool get_phys_addr_gpc(CPUARMState *env, S1Translate *ptw, return false; } -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address, - MMUAccessType access_type, ARMMMUIdx mmu_idx, - bool is_secure, GetPhysAddrResult *result, - ARMMMUFaultInfo *fi) +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address, + MMUAccessType access_type, +
[PATCH 5/5] target/arm/helper: Implement CNTHCTL_EL2.CNT[VP]MASK
When FEAT_RME is implemented, these bits override the value of CNT[VP]_CTL_EL0.IMASK in Realm and Root state. Signed-off-by: Jean-Philippe Brucker --- target/arm/helper.c | 21 +++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/target/arm/helper.c b/target/arm/helper.c index 2017b11795..5b173a827f 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -2608,6 +2608,23 @@ static uint64_t gt_get_countervalue(CPUARMState *env) return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / gt_cntfrq_period_ns(cpu); } +static bool gt_is_masked(CPUARMState *env, int timeridx) +{ +ARMSecuritySpace ss = arm_security_space(env); + +/* + * If bits CNTHCTL_EL2.CNT[VP]MASK are set, they override + * CNT[VP]_CTL_EL0.IMASK. They are RES0 in Secure and NonSecure state. + */ +if ((ss == ARMSS_Root || ss == ARMSS_Realm) && +((timeridx == GTIMER_VIRT && extract64(env->cp15.cnthctl_el2, 18, 1)) || + (timeridx == GTIMER_PHYS && extract64(env->cp15.cnthctl_el2, 19, 1)))) { +return true; +} + +return env->cp15.c14_timer[timeridx].ctl & 2; +} + static void gt_recalc_timer(ARMCPU *cpu, int timeridx) { ARMGenericTimer *gt = &cpu->env.cp15.c14_timer[timeridx]; @@ -2627,7 +2644,7 @@ static void gt_recalc_timer(ARMCPU *cpu, int timeridx) gt->ctl = deposit32(gt->ctl, 2, 1, istatus); -irqstate = (istatus && !(gt->ctl & 2)); +irqstate = (istatus && !gt_is_masked(&cpu->env, timeridx)); qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate); if (istatus) { @@ -2759,7 +2776,7 @@ static void gt_ctl_write(CPUARMState *env, const ARMCPRegInfo *ri, * IMASK toggled: don't need to recalculate, * just set the interrupt line based on ISTATUS */ -int irqstate = (oldval & 4) && !(value & 2); +int irqstate = (oldval & 4) && !gt_is_masked(env, timeridx); trace_arm_gt_imask_toggle(timeridx, irqstate); qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate); -- 2.41.0
[PATCH 4/5] target/arm: Pass security space rather than flag for AT instructions
At the moment we only handle Secure and Nonsecure security spaces for the AT instructions. Add support for Realm and Root. For AArch64, arm_security_space() gives the desired space. ARM DDI0487J says (R_NYXTL): If EL3 is implemented, then when an address translation instruction that applies to an Exception level lower than EL3 is executed, the Effective value of SCR_EL3.{NSE, NS} determines the target Security state that the instruction applies to. For AArch32, some instructions can access NonSecure space from Secure, so we still need to pass the state explicitly to do_ats_write(). Signed-off-by: Jean-Philippe Brucker --- I haven't tested AT instructions in Realm/Root space yet, but it looks like the patch is needed. RMM doesn't issue AT instructions like KVM does in non-secure state (which triggered the bug in the previous patch). --- target/arm/internals.h | 18 +- target/arm/helper.c| 27 --- target/arm/ptw.c | 12 ++-- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/target/arm/internals.h b/target/arm/internals.h index fc90c364f7..cf13bb94f5 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1217,24 +1217,24 @@ bool get_phys_addr(CPUARMState *env, target_ulong address, __attribute__((nonnull)); /** - * get_phys_addr_with_secure_nogpc: get the physical address for a virtual - * address + * get_phys_addr_with_space_nogpc: get the physical address for a virtual + * address * @env: CPUARMState * @address: virtual address to get physical address for * @access_type: 0 for read, 1 for write, 2 for execute * @mmu_idx: MMU index indicating required translation regime - * @is_secure: security state for the access + * @space: security space for the access * @result: set on translation success. 
* @fi: set to fault info if the translation fails * - * Similar to get_phys_addr, but use the given security regime and don't perform + * Similar to get_phys_addr, but use the given security space and don't perform * a Granule Protection Check on the resulting address. */ -bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address, - MMUAccessType access_type, - ARMMMUIdx mmu_idx, bool is_secure, - GetPhysAddrResult *result, - ARMMMUFaultInfo *fi) +bool get_phys_addr_with_space_nogpc(CPUARMState *env, target_ulong address, +MMUAccessType access_type, +ARMMMUIdx mmu_idx, ARMSecuritySpace space, +GetPhysAddrResult *result, +ARMMMUFaultInfo *fi) __attribute__((nonnull)); bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, diff --git a/target/arm/helper.c b/target/arm/helper.c index 3ee2bb5fe1..2017b11795 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -3357,7 +3357,7 @@ static int par_el1_shareability(GetPhysAddrResult *res) static uint64_t do_ats_write(CPUARMState *env, uint64_t value, MMUAccessType access_type, ARMMMUIdx mmu_idx, - bool is_secure) + ARMSecuritySpace ss) { bool ret; uint64_t par64; @@ -3369,8 +3369,8 @@ static uint64_t do_ats_write(CPUARMState *env, uint64_t value, * I_MXTJT: Granule protection checks are not performed on the final address * of a successful translation. 
*/ -ret = get_phys_addr_with_secure_nogpc(env, value, access_type, mmu_idx, - is_secure, &res, &fi); +ret = get_phys_addr_with_space_nogpc(env, value, access_type, mmu_idx, ss, + &res, &fi); /* * ATS operations only do S1 or S1+S2 translations, so we never @@ -3535,7 +3535,7 @@ static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) uint64_t par64; ARMMMUIdx mmu_idx; int el = arm_current_el(env); -bool secure = arm_is_secure_below_el3(env); +ARMSecuritySpace ss = arm_security_space(env); switch (ri->opc2 & 6) { case 0: @@ -3543,10 +3543,9 @@ static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) switch (el) { case 3: mmu_idx = ARMMMUIdx_E3; -secure = true; break; case 2: -g_assert(!secure); /* ARMv8.4-SecEL2 is 64-bit only */ +g_assert(ss != ARMSS_Secure); /* ARMv8.4-SecEL2 is 64-bit only */ /* fall through */ case 1: if (ri->crm == 9 && (env->uncached_cpsr & CPSR_PAN)) { @@ -3564,10 +3563,9 @@ static void ats_write(CPUARMState *env,
[PATCH 0/5] target/arm: Fixes for RME
With these patches I'm able to boot a Realm guest under "-cpu max,x-rme=on". They are based on Peter's series which fixes handling of NSTable: https://lore.kernel.org/qemu-devel/20230714154648.327466-1-peter.mayd...@linaro.org/ Running a Realm guest requires components at EL3 and R-EL2. Some rough support for TF-A and RMM is available here: https://jpbrucker.net/git/tf-a/log/?h=qemu-rme https://jpbrucker.net/git/rmm/log/?h=qemu-rme I'll clean this up before sending it out. I also need to manually disable FEAT_SME in QEMU in order to boot this, otherwise the Linux host fails to boot because hyp-stub accesses to SME regs are trapped to EL3, which doesn't support RME+SME at the moment. The right fix is probably in TF-A but I haven't investigated yet. Jean-Philippe Brucker (5): target/arm/ptw: Load stage-2 tables from realm physical space target/arm/helper: Fix vae2_tlbmask() target/arm: Skip granule protection checks for AT instructions target/arm: Pass security space rather than flag for AT instructions target/arm/helper: Implement CNTHCTL_EL2.CNT[VP]MASK target/arm/internals.h | 25 -- target/arm/helper.c| 78 -- target/arm/ptw.c | 19 ++ 3 files changed, 79 insertions(+), 43 deletions(-) -- 2.41.0
Re: [PATCH for-8.1] virtio-iommu: Standardize granule extraction and formatting
On Tue, Jul 18, 2023 at 08:21:36PM +0200, Eric Auger wrote: > At several locations we compute the granule from the config > page_size_mask using ctz() and then format it in traces using > BIT(). As the page_size_mask is 64b we should use ctz64 and > BIT_ULL() for formatting. We failed to be consistent. > > Note the page_size_mask is guaranteed to be non null. The spec > mandates the device to set at least one bit, so ctz64 cannot > return 64. This is guaranteed by the fact the device > initializes the page_size_mask to qemu_target_page_mask() > and then the page_size_mask is further constrained by > virtio_iommu_set_page_size_mask() callback which can't > result in a new mask being null. So if Coverity complains > round those ctz64/BIT_ULL with CID 1517772 this is a false > positive > > Signed-off-by: Eric Auger > Fixes: 94df5b2180 ("virtio-iommu: Fix 64kB host page size VFIO device > assignment") Reviewed-by: Jean-Philippe Brucker > --- > hw/virtio/virtio-iommu.c | 8 +--- > 1 file changed, 5 insertions(+), 3 deletions(-) > > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 201127c488..c6ee4d7a3c 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -852,17 +852,19 @@ static IOMMUTLBEntry > virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, > VirtIOIOMMUEndpoint *ep; > uint32_t sid, flags; > bool bypass_allowed; > +int granule; > bool found; > int i; > > interval.low = addr; > interval.high = addr + 1; > +granule = ctz64(s->config.page_size_mask); > > IOMMUTLBEntry entry = { > .target_as = &address_space_memory, > .iova = addr, > .translated_addr = addr, > -.addr_mask = (1 << ctz32(s->config.page_size_mask)) - 1, > +.addr_mask = BIT_ULL(granule) - 1, > .perm = IOMMU_NONE, > }; > > @@ -1115,7 +1117,7 @@ static int > virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, > if (s->granule_frozen) { > int cur_granule = ctz64(cur_mask); > > -if (!(BIT(cur_granule) & 
new_mask)) { > error_setg(errp, "virtio-iommu %s does not support frozen > granule 0x%llx", > mr->parent_obj.name, BIT_ULL(cur_granule)); > return -1; > @@ -1161,7 +1163,7 @@ static void virtio_iommu_freeze_granule(Notifier > *notifier, void *data) > } > s->granule_frozen = true; > granule = ctz64(s->config.page_size_mask); > -trace_virtio_iommu_freeze_granule(BIT(granule)); > +trace_virtio_iommu_freeze_granule(BIT_ULL(granule)); > } > > static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) > -- > 2.38.1 >
Re: [PATCH 0/5] target/arm: Fixes for RME
On Thu, Jul 20, 2023 at 01:05:58PM +0100, Peter Maydell wrote: > On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker > wrote: > > > > With these patches I'm able to boot a Realm guest under > > "-cpu max,x-rme=on". They are based on Peter's series which fixes > > handling of NSTable: > > https://lore.kernel.org/qemu-devel/20230714154648.327466-1-peter.mayd...@linaro.org/ > > Thanks for testing this -- this is a lot closer to > working out of the box than I thought we might be :-) > I'm tempted to try to put these fixes (and my ptw patchset) > into 8.1, but OTOH I suspect work on Realm guests will probably > still want to use a bleeding-edge QEMU for other bugs we're > going to discover over the next few months, so IDK. We'll > see how code review goes on those, I guess. > > > Running a Realm guest requires components at EL3 and R-EL2. Some rough > > support for TF-A and RMM is available here: > > https://jpbrucker.net/git/tf-a/log/?h=qemu-rme > > https://jpbrucker.net/git/rmm/log/?h=qemu-rme > > I'll clean this up before sending it out. > > > > I also need to manually disable FEAT_SME in QEMU in order to boot this, > > Do you mean you needed to do something more invasive than > '-cpu max,x-rme=on,sme=off' ? Ah no, I hadn't noticed there was a sme=off switch, that's much better Thanks, Jean > > > otherwise the Linux host fails to boot because hyp-stub accesses to SME > > regs are trapped to EL3, which doesn't support RME+SME at the moment. > > The right fix is probably in TF-A but I haven't investigated yet. > > thanks > -- PMM
Re: [PATCH 2/5] target/arm/helper: Fix vae2_tlbmask()
On Thu, Jul 20, 2023 at 05:35:49PM +0100, Peter Maydell wrote: > On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker > wrote: > > > > When HCR_EL2.E2H is enabled, TLB entries are formed using the EL2&0 > > translation regime, instead of the EL2 translation regime. The TLB VAE2* > > instructions invalidate the regime that corresponds to the current value > > of HCR_EL2.E2H. > > > > At the moment we only invalidate the EL2 translation regime. This causes > > problems with RMM, which issues TLBI VAE2IS instructions with > > HCR_EL2.E2H enabled. Update vae2_tlbmask() to take HCR_EL2.E2H into > > account. > > > > Signed-off-by: Jean-Philippe Brucker > > --- > > target/arm/helper.c | 26 ++ > > 1 file changed, 18 insertions(+), 8 deletions(-) > > > > diff --git a/target/arm/helper.c b/target/arm/helper.c > > index e1b3db6f5f..07a9ac70f5 100644 > > --- a/target/arm/helper.c > > +++ b/target/arm/helper.c > > @@ -4663,6 +4663,21 @@ static int vae1_tlbmask(CPUARMState *env) > > return mask; > > } > > > > +static int vae2_tlbmask(CPUARMState *env) > > +{ > > +uint64_t hcr = arm_hcr_el2_eff(env); > > +uint16_t mask; > > + > > +if (hcr & HCR_E2H) { > > +mask = ARMMMUIdxBit_E20_2 | > > + ARMMMUIdxBit_E20_2_PAN | > > + ARMMMUIdxBit_E20_0; > > +} else { > > +mask = ARMMMUIdxBit_E2; > > +} > > +return mask; > > +} > > + > > /* Return 56 if TBI is enabled, 64 otherwise. */ > > static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx, > >uint64_t addr) > > > @@ -4838,11 +4853,11 @@ static void tlbi_aa64_vae2is_write(CPUARMState > > *env, const ARMCPRegInfo *ri, > > uint64_t value) > > { > > CPUState *cs = env_cpu(env); > > +int mask = vae2_tlbmask(env); > > uint64_t pageaddr = sextract64(value << 12, 0, 56); > > int bits = tlbbits_for_regime(env, ARMMMUIdx_E2, pageaddr); > > Shouldn't the argument to tlbbits_for_regime() also change > if we're dealing with the EL2&0 regime rather than EL2 ? 
Yes, it affects the result since EL2&0 has two ranges Thanks, Jean > > > > > -tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, > > - ARMMMUIdxBit_E2, bits); > > +tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, > > bits); > > } > > thanks > -- PMM
Re: [PATCH 3/5] target/arm: Skip granule protection checks for AT instructions
On Thu, Jul 20, 2023 at 05:39:56PM +0100, Peter Maydell wrote: > On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker > wrote: > > > > GPC checks are not performed on the output address for AT instructions, > > as stated by ARM DDI 0487J in D8.12.2: > > > > When populating PAR_EL1 with the result of an address translation > > instruction, granule protection checks are not performed on the final > > output address of a successful translation. > > > > Rename get_phys_addr_with_secure(), since it's only used to handle AT > > instructions. > > > > Signed-off-by: Jean-Philippe Brucker > > --- > > This incidentally fixes a problem with AT S1E1 instructions which can > > output an IPA and should definitely not cause a GPC. > > --- > > target/arm/internals.h | 25 ++--- > > target/arm/helper.c| 8 ++-- > > target/arm/ptw.c | 11 ++- > > 3 files changed, 26 insertions(+), 18 deletions(-) > > > > diff --git a/target/arm/internals.h b/target/arm/internals.h > > index 0f01bc32a8..fc90c364f7 100644 > > --- a/target/arm/internals.h > > +++ b/target/arm/internals.h > > @@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult { > > } GetPhysAddrResult; > > > > /** > > - * get_phys_addr_with_secure: get the physical address for a virtual > > address > > + * get_phys_addr: get the physical address for a virtual address > > * @env: CPUARMState > > * @address: virtual address to get physical address for > > * @access_type: 0 for read, 1 for write, 2 for execute > > * @mmu_idx: MMU index indicating required translation regime > > - * @is_secure: security state for the access > > * @result: set on translation success. > > * @fi: set to fault info if the translation fails > > * > > @@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult { > > * * for PSMAv5 based systems we don't bother to return a full FSR format > > *value. 
> > */ > > -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address, > > - MMUAccessType access_type, > > - ARMMMUIdx mmu_idx, bool is_secure, > > - GetPhysAddrResult *result, ARMMMUFaultInfo > > *fi) > > +bool get_phys_addr(CPUARMState *env, target_ulong address, > > + MMUAccessType access_type, ARMMMUIdx mmu_idx, > > + GetPhysAddrResult *result, ARMMMUFaultInfo *fi) > > __attribute__((nonnull)); > > > What is going on in this bit of the patch ? We already > have a prototype for get_phys_addr() with a doc comment. > Is this git's diff-output producing something silly > for a change that is not logically touching get_phys_addr()'s > prototype and comment at all? I swapped the two prototypes in order to make the new comment for get_phys_addr_with_secure_nogpc() more clear. Tried to convey that get_phys_addr() is the normal function and get_phys_addr_with_secure_nogpc() is special. So git is working as expected and this is a style change, I can switch them back if you prefer. Thanks, Jean > > > /** > > - * get_phys_addr: get the physical address for a virtual address > > + * get_phys_addr_with_secure_nogpc: get the physical address for a virtual > > + * address > > * @env: CPUARMState > > * @address: virtual address to get physical address for > > * @access_type: 0 for read, 1 for write, 2 for execute > > * @mmu_idx: MMU index indicating required translation regime > > + * @is_secure: security state for the access > > * @result: set on translation success. > > * @fi: set to fault info if the translation fails > > * > > - * Similarly, but use the security regime of @mmu_idx. > > + * Similar to get_phys_addr, but use the given security regime and don't > > perform > > + * a Granule Protection Check on the resulting address. 
> > */ > > -bool get_phys_addr(CPUARMState *env, target_ulong address, > > - MMUAccessType access_type, ARMMMUIdx mmu_idx, > > - GetPhysAddrResult *result, ARMMMUFaultInfo *fi) > > +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong > > address, > > + MMUAccessType access_type, > > + ARMMMUIdx mmu_idx, bool is_secure, > > + GetPhysAddrResult *result, > > + ARMMMUFaultInfo *fi) > > __attribute__((nonnull)); > > thanks > -- PMM
[PATCH v2 4/6] target/arm: Pass security space rather than flag for AT instructions
At the moment we only handle Secure and Nonsecure security spaces for the AT instructions. Add support for Realm and Root. For AArch64, arm_security_space() gives the desired space. ARM DDI0487J says (R_NYXTL): If EL3 is implemented, then when an address translation instruction that applies to an Exception level lower than EL3 is executed, the Effective value of SCR_EL3.{NSE, NS} determines the target Security state that the instruction applies to. For AArch32, some instructions can access NonSecure space from Secure, so we still need to pass the state explicitly to do_ats_write(). Signed-off-by: Jean-Philippe Brucker Reviewed-by: Peter Maydell --- target/arm/internals.h | 18 +- target/arm/helper.c| 27 --- target/arm/ptw.c | 12 ++-- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/target/arm/internals.h b/target/arm/internals.h index fc90c364f7..cf13bb94f5 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1217,24 +1217,24 @@ bool get_phys_addr(CPUARMState *env, target_ulong address, __attribute__((nonnull)); /** - * get_phys_addr_with_secure_nogpc: get the physical address for a virtual - * address + * get_phys_addr_with_space_nogpc: get the physical address for a virtual + * address * @env: CPUARMState * @address: virtual address to get physical address for * @access_type: 0 for read, 1 for write, 2 for execute * @mmu_idx: MMU index indicating required translation regime - * @is_secure: security state for the access + * @space: security space for the access * @result: set on translation success. * @fi: set to fault info if the translation fails * - * Similar to get_phys_addr, but use the given security regime and don't perform + * Similar to get_phys_addr, but use the given security space and don't perform * a Granule Protection Check on the resulting address. 
*/ -bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address, - MMUAccessType access_type, - ARMMMUIdx mmu_idx, bool is_secure, - GetPhysAddrResult *result, - ARMMMUFaultInfo *fi) +bool get_phys_addr_with_space_nogpc(CPUARMState *env, target_ulong address, +MMUAccessType access_type, +ARMMMUIdx mmu_idx, ARMSecuritySpace space, +GetPhysAddrResult *result, +ARMMMUFaultInfo *fi) __attribute__((nonnull)); bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, diff --git a/target/arm/helper.c b/target/arm/helper.c index 427de6bd2a..fbb03c364b 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -3357,7 +3357,7 @@ static int par_el1_shareability(GetPhysAddrResult *res) static uint64_t do_ats_write(CPUARMState *env, uint64_t value, MMUAccessType access_type, ARMMMUIdx mmu_idx, - bool is_secure) + ARMSecuritySpace ss) { bool ret; uint64_t par64; @@ -3369,8 +3369,8 @@ static uint64_t do_ats_write(CPUARMState *env, uint64_t value, * I_MXTJT: Granule protection checks are not performed on the final address * of a successful translation. 
*/ -ret = get_phys_addr_with_secure_nogpc(env, value, access_type, mmu_idx, - is_secure, &res, &fi); +ret = get_phys_addr_with_space_nogpc(env, value, access_type, mmu_idx, ss, + &res, &fi); /* * ATS operations only do S1 or S1+S2 translations, so we never @@ -3535,7 +3535,7 @@ static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) uint64_t par64; ARMMMUIdx mmu_idx; int el = arm_current_el(env); -bool secure = arm_is_secure_below_el3(env); +ARMSecuritySpace ss = arm_security_space(env); switch (ri->opc2 & 6) { case 0: @@ -3543,10 +3543,9 @@ static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) switch (el) { case 3: mmu_idx = ARMMMUIdx_E3; -secure = true; break; case 2: -g_assert(!secure); /* ARMv8.4-SecEL2 is 64-bit only */ +g_assert(ss != ARMSS_Secure); /* ARMv8.4-SecEL2 is 64-bit only */ /* fall through */ case 1: if (ri->crm == 9 && (env->uncached_cpsr & CPSR_PAN)) { @@ -3564,10 +3563,9 @@ static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) switch (el) { case 3: mmu_idx = ARMMMUIdx_E10_0; -secure = true; break; case 2: -
[PATCH v2 2/6] target/arm/helper: Fix tlbmask and tlbbits for TLBI VAE2*
When HCR_EL2.E2H is enabled, TLB entries are formed using the EL2&0 translation regime, instead of the EL2 translation regime. The TLB VAE2* instructions invalidate the regime that corresponds to the current value of HCR_EL2.E2H. At the moment we only invalidate the EL2 translation regime. This causes problems with RMM, which issues TLBI VAE2IS instructions with HCR_EL2.E2H enabled. Update vae2_tlbmask() to take HCR_EL2.E2H into account. Add vae2_tlbbits() as well, since the top-byte-ignore configuration is different between the EL2&0 and EL2 regime. Signed-off-by: Jean-Philippe Brucker --- target/arm/helper.c | 50 - 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/target/arm/helper.c b/target/arm/helper.c index 2959d27543..a4c2c1bde5 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -4663,6 +4663,21 @@ static int vae1_tlbmask(CPUARMState *env) return mask; } +static int vae2_tlbmask(CPUARMState *env) +{ +uint64_t hcr = arm_hcr_el2_eff(env); +uint16_t mask; + +if (hcr & HCR_E2H) { +mask = ARMMMUIdxBit_E20_2 | + ARMMMUIdxBit_E20_2_PAN | + ARMMMUIdxBit_E20_0; +} else { +mask = ARMMMUIdxBit_E2; +} +return mask; +} + /* Return 56 if TBI is enabled, 64 otherwise. */ static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx, uint64_t addr) @@ -4689,6 +4704,25 @@ static int vae1_tlbbits(CPUARMState *env, uint64_t addr) return tlbbits_for_regime(env, mmu_idx, addr); } +static int vae2_tlbbits(CPUARMState *env, uint64_t addr) +{ +uint64_t hcr = arm_hcr_el2_eff(env); +ARMMMUIdx mmu_idx; + +/* + * Only the regime of the mmu_idx below is significant. + * Regime EL2&0 has two ranges with separate TBI configuration, while EL2 + * only has one. 
+ */ +if (hcr & HCR_E2H) { +mmu_idx = ARMMMUIdx_E20_2; +} else { +mmu_idx = ARMMMUIdx_E2; +} + +return tlbbits_for_regime(env, mmu_idx, addr); +} + static void tlbi_aa64_vmalle1is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { @@ -4781,10 +4815,11 @@ static void tlbi_aa64_vae2_write(CPUARMState *env, const ARMCPRegInfo *ri, * flush-last-level-only. */ CPUState *cs = env_cpu(env); -int mask = e2_tlbmask(env); +int mask = vae2_tlbmask(env); uint64_t pageaddr = sextract64(value << 12, 0, 56); +int bits = vae2_tlbbits(env, pageaddr); -tlb_flush_page_by_mmuidx(cs, pageaddr, mask); +tlb_flush_page_bits_by_mmuidx(cs, pageaddr, mask, bits); } static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -4838,11 +4873,11 @@ static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { CPUState *cs = env_cpu(env); +int mask = vae2_tlbmask(env); uint64_t pageaddr = sextract64(value << 12, 0, 56); -int bits = tlbbits_for_regime(env, ARMMMUIdx_E2, pageaddr); +int bits = vae2_tlbbits(env, pageaddr); -tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, - ARMMMUIdxBit_E2, bits); +tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, bits); } static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -5014,11 +5049,6 @@ static void tlbi_aa64_rvae1is_write(CPUARMState *env, do_rvae_write(env, value, vae1_tlbmask(env), true); } -static int vae2_tlbmask(CPUARMState *env) -{ -return ARMMMUIdxBit_E2; -} - static void tlbi_aa64_rvae2_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) -- 2.41.0
[PATCH v2 6/6] target/arm/helper: Implement CNTHCTL_EL2.CNT[VP]MASK
When FEAT_RME is implemented, these bits override the value of CNT[VP]_CTL_EL0.IMASK in Realm and Root state. Move the IRQ state update into a new gt_update_irq() function and test those bits every time we recompute the IRQ state. Since we're removing the IRQ state from some trace events, add a new trace event for gt_update_irq(). Signed-off-by: Jean-Philippe Brucker --- target/arm/cpu.h| 3 +++ target/arm/helper.c | 54 - target/arm/trace-events | 7 +++--- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/target/arm/cpu.h b/target/arm/cpu.h index bcd65a63ca..bedc7ec6dc 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -1743,6 +1743,9 @@ static inline void xpsr_write(CPUARMState *env, uint32_t val, uint32_t mask) #define HSTR_TTEE (1 << 16) #define HSTR_TJDBX (1 << 17) +#define CNTHCTL_CNTVMASK (1 << 18) +#define CNTHCTL_CNTPMASK (1 << 19) + /* Return the current FPSCR value. */ uint32_t vfp_get_fpscr(CPUARMState *env); void vfp_set_fpscr(CPUARMState *env, uint32_t val); diff --git a/target/arm/helper.c b/target/arm/helper.c index 77dd80ad28..68e915ddda 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -2608,6 +2608,28 @@ static uint64_t gt_get_countervalue(CPUARMState *env) return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / gt_cntfrq_period_ns(cpu); } +static void gt_update_irq(ARMCPU *cpu, int timeridx) +{ +CPUARMState *env = &cpu->env; +uint64_t cnthctl = env->cp15.cnthctl_el2; +ARMSecuritySpace ss = arm_security_space(env); +/* ISTATUS && !IMASK */ +int irqstate = (env->cp15.c14_timer[timeridx].ctl & 6) == 4; + +/* + * If bit CNTHCTL_EL2.CNT[VP]MASK is set, it overrides IMASK. + * It is RES0 in Secure and NonSecure state. 
+ */ +if ((ss == ARMSS_Root || ss == ARMSS_Realm) && +((timeridx == GTIMER_VIRT && (cnthctl & CNTHCTL_CNTVMASK)) || + (timeridx == GTIMER_PHYS && (cnthctl & CNTHCTL_CNTPMASK)))) { +irqstate = 0; +} + +qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate); +trace_arm_gt_update_irq(timeridx, irqstate); +} + static void gt_recalc_timer(ARMCPU *cpu, int timeridx) { ARMGenericTimer *gt = &cpu->env.cp15.c14_timer[timeridx]; @@ -2623,13 +2645,9 @@ static void gt_recalc_timer(ARMCPU *cpu, int timeridx) /* Note that this must be unsigned 64 bit arithmetic: */ int istatus = count - offset >= gt->cval; uint64_t nexttick; -int irqstate; gt->ctl = deposit32(gt->ctl, 2, 1, istatus); -irqstate = (istatus && !(gt->ctl & 2)); -qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate); - if (istatus) { /* Next transition is when count rolls back over to zero */ nexttick = UINT64_MAX; @@ -2648,14 +2666,14 @@ static void gt_recalc_timer(ARMCPU *cpu, int timeridx) } else { timer_mod(cpu->gt_timer[timeridx], nexttick); } -trace_arm_gt_recalc(timeridx, irqstate, nexttick); +trace_arm_gt_recalc(timeridx, nexttick); } else { /* Timer disabled: ISTATUS and timer output always clear */ gt->ctl &= ~4; -qemu_set_irq(cpu->gt_timer_outputs[timeridx], 0); timer_del(cpu->gt_timer[timeridx]); trace_arm_gt_recalc_disabled(timeridx); } +gt_update_irq(cpu, timeridx); } static void gt_timer_reset(CPUARMState *env, const ARMCPRegInfo *ri, @@ -2759,10 +2777,8 @@ static void gt_ctl_write(CPUARMState *env, const ARMCPRegInfo *ri, * IMASK toggled: don't need to recalculate, * just set the interrupt line based on ISTATUS */ -int irqstate = (oldval & 4) && !(value & 2); - -trace_arm_gt_imask_toggle(timeridx, irqstate); -qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate); +trace_arm_gt_imask_toggle(timeridx); +gt_update_irq(cpu, timeridx); } } @@ -2888,6 +2904,21 @@ static void gt_virt_ctl_write(CPUARMState *env, const ARMCPRegInfo *ri, gt_ctl_write(env, ri, GTIMER_VIRT, value); } +static void 
gt_cnthctl_write(CPUARMState *env, const ARMCPRegInfo *ri, + uint64_t value) +{ +ARMCPU *cpu = env_archcpu(env); +uint32_t oldval = env->cp15.cnthctl_el2; + +raw_write(env, ri, value); + +if ((oldval ^ value) & CNTHCTL_CNTVMASK) { +gt_update_irq(cpu, GTIMER_VIRT); +} else if ((oldval ^ value) & CNTHCTL_CNTPMASK) { +gt_update_irq(cpu, GTIMER_PHYS); +} +} + static void gt_cntvoff_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { @@ -6200,7 +6231,8 @@ static const ARMCPRegInfo el2_cp_reginfo[] = { * reset values as IMPDEF. We choose to reset to 3
[PATCH v2 3/6] target/arm: Skip granule protection checks for AT instructions
GPC checks are not performed on the output address for AT instructions, as stated by ARM DDI 0487J in D8.12.2: When populating PAR_EL1 with the result of an address translation instruction, granule protection checks are not performed on the final output address of a successful translation. Rename get_phys_addr_with_secure(), since it's only used to handle AT instructions. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Peter Maydell --- target/arm/internals.h | 25 ++--- target/arm/helper.c| 8 ++-- target/arm/ptw.c | 11 ++- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/target/arm/internals.h b/target/arm/internals.h index 0f01bc32a8..fc90c364f7 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult { } GetPhysAddrResult; /** - * get_phys_addr_with_secure: get the physical address for a virtual address + * get_phys_addr: get the physical address for a virtual address * @env: CPUARMState * @address: virtual address to get physical address for * @access_type: 0 for read, 1 for write, 2 for execute * @mmu_idx: MMU index indicating required translation regime - * @is_secure: security state for the access * @result: set on translation success. * @fi: set to fault info if the translation fails * @@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult { * * for PSMAv5 based systems we don't bother to return a full FSR format *value. 
*/ -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address, - MMUAccessType access_type, - ARMMMUIdx mmu_idx, bool is_secure, - GetPhysAddrResult *result, ARMMMUFaultInfo *fi) +bool get_phys_addr(CPUARMState *env, target_ulong address, + MMUAccessType access_type, ARMMMUIdx mmu_idx, + GetPhysAddrResult *result, ARMMMUFaultInfo *fi) __attribute__((nonnull)); /** - * get_phys_addr: get the physical address for a virtual address + * get_phys_addr_with_secure_nogpc: get the physical address for a virtual + * address * @env: CPUARMState * @address: virtual address to get physical address for * @access_type: 0 for read, 1 for write, 2 for execute * @mmu_idx: MMU index indicating required translation regime + * @is_secure: security state for the access * @result: set on translation success. * @fi: set to fault info if the translation fails * - * Similarly, but use the security regime of @mmu_idx. + * Similar to get_phys_addr, but use the given security regime and don't perform + * a Granule Protection Check on the resulting address. */ -bool get_phys_addr(CPUARMState *env, target_ulong address, - MMUAccessType access_type, ARMMMUIdx mmu_idx, - GetPhysAddrResult *result, ARMMMUFaultInfo *fi) +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address, + MMUAccessType access_type, + ARMMMUIdx mmu_idx, bool is_secure, + GetPhysAddrResult *result, + ARMMMUFaultInfo *fi) __attribute__((nonnull)); bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address, diff --git a/target/arm/helper.c b/target/arm/helper.c index a4c2c1bde5..427de6bd2a 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -3365,8 +3365,12 @@ static uint64_t do_ats_write(CPUARMState *env, uint64_t value, ARMMMUFaultInfo fi = {}; GetPhysAddrResult res = {}; -ret = get_phys_addr_with_secure(env, value, access_type, mmu_idx, -is_secure, &res, &fi); +/* + * I_MXTJT: Granule protection checks are not performed on the final address + * of a successful translation. 
+ */ +ret = get_phys_addr_with_secure_nogpc(env, value, access_type, mmu_idx, + is_secure, &res, &fi); /* * ATS operations only do S1 or S1+S2 translations, so we never diff --git a/target/arm/ptw.c b/target/arm/ptw.c index 063adbd84a..33179f3471 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -3418,16 +3418,17 @@ static bool get_phys_addr_gpc(CPUARMState *env, S1Translate *ptw, return false; } -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address, - MMUAccessType access_type, ARMMMUIdx mmu_idx, - bool is_secure, GetPhysAddrResult *result, - ARMMMUFaultInfo *fi) +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address, + MMUAccessType access_type, + ARMMMUIdx mmu_idx, bool is_secure, +
[PATCH v2 1/6] target/arm/ptw: Load stage-2 tables from realm physical space
In realm state, stage-2 translation tables are fetched from the realm physical address space (R_PGRQD). Signed-off-by: Jean-Philippe Brucker --- target/arm/ptw.c | 26 ++ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/target/arm/ptw.c b/target/arm/ptw.c index d1de934702..063adbd84a 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -157,22 +157,32 @@ static ARMMMUIdx ptw_idx_for_stage_2(CPUARMState *env, ARMMMUIdx stage2idx) /* * We're OK to check the current state of the CPU here because - * (1) we always invalidate all TLBs when the SCR_EL3.NS bit changes + * (1) we always invalidate all TLBs when the SCR_EL3.NS or SCR_EL3.NSE bit + * changes. * (2) there's no way to do a lookup that cares about Stage 2 for a * different security state to the current one for AArch64, and AArch32 * never has a secure EL2. (AArch32 ATS12NSO[UP][RW] allow EL3 to do * an NS stage 1+2 lookup while the NS bit is 0.) */ -if (!arm_is_secure_below_el3(env) || !arm_el_is_aa64(env, 3)) { +if (!arm_el_is_aa64(env, 3)) { return ARMMMUIdx_Phys_NS; } -if (stage2idx == ARMMMUIdx_Stage2_S) { -s2walk_secure = !(env->cp15.vstcr_el2 & VSTCR_SW); -} else { -s2walk_secure = !(env->cp15.vtcr_el2 & VTCR_NSW); -} -return s2walk_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; +switch (arm_security_space_below_el3(env)) { +case ARMSS_NonSecure: +return ARMMMUIdx_Phys_NS; +case ARMSS_Realm: +return ARMMMUIdx_Phys_Realm; +case ARMSS_Secure: +if (stage2idx == ARMMMUIdx_Stage2_S) { +s2walk_secure = !(env->cp15.vstcr_el2 & VSTCR_SW); +} else { +s2walk_secure = !(env->cp15.vtcr_el2 & VTCR_NSW); +} +return s2walk_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS; +default: +g_assert_not_reached(); +} } static bool regime_translation_big_endian(CPUARMState *env, ARMMMUIdx mmu_idx) -- 2.41.0
[PATCH v2 5/6] target/arm/helper: Check SCR_EL3.{NSE, NS} encoding for AT instructions
The AT instruction is UNDEFINED if the {NSE,NS} configuration is invalid. Add a function to check this on all AT instructions that apply to an EL lower than 3. Suggested-by: Peter Maydell Signed-off-by: Jean-Philippe Brucker --- target/arm/helper.c | 36 +--- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/target/arm/helper.c b/target/arm/helper.c index fbb03c364b..77dd80ad28 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -3616,6 +3616,20 @@ static void ats1h_write(CPUARMState *env, const ARMCPRegInfo *ri, #endif /* CONFIG_TCG */ } +static CPAccessResult at_e012_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ +/* + * R_NYXTL: instruction is UNDEFINED if it applies to an Exception level + * lower than EL3 and the combination SCR_EL3.{NSE,NS} is reserved. + */ +if (cpu_isar_feature(aa64_rme, env_archcpu(env)) && +(env->cp15.scr_el3 & (SCR_NSE | SCR_NS)) == SCR_NSE) { +return CP_ACCESS_TRAP; +} +return CP_ACCESS_OK; +} + static CPAccessResult at_s1e2_access(CPUARMState *env, const ARMCPRegInfo *ri, bool isread) { @@ -3623,7 +3637,7 @@ static CPAccessResult at_s1e2_access(CPUARMState *env, const ARMCPRegInfo *ri, !(env->cp15.scr_el3 & (SCR_NS | SCR_EEL2))) { return CP_ACCESS_TRAP; } -return CP_ACCESS_OK; +return at_e012_access(env, ri, isread); } static void ats_write64(CPUARMState *env, const ARMCPRegInfo *ri, @@ -5505,38 +5519,38 @@ static const ARMCPRegInfo v8_cp_reginfo[] = { .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 0, .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, .fgt = FGT_ATS1E1R, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S1E1W", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 1, .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, .fgt = FGT_ATS1E1W, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S1E0R", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 
0, .crn = 7, .crm = 8, .opc2 = 2, .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, .fgt = FGT_ATS1E0R, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S1E0W", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 3, .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, .fgt = FGT_ATS1E0W, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S12E1R", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 4, .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S12E1W", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 5, .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S12E0R", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 6, .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S12E0W", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 7, .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, /* AT S1E2* are elsewhere as they UNDEF from EL3 if EL2 is not present */ { .name = "AT_S1E3R", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 6, .crn = 7, .crm = 8, .opc2 = 0, @@ -8078,12 +8092,12 @@ static const ARMCPRegInfo ats1e1_reginfo[] = { .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 9, .opc2 = 0, .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, .fgt = FGT_ATS1E1RP, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, { .name = "AT_S1E1WP", .state = ARM_CP_STATE_AA64, .opc0 = 1, .opc1 = 0, 
.crn = 7, .crm = 9, .opc2 = 1, .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, .fgt = FGT_ATS1E1WP, - .writefn = ats_write64 }, + .accessfn = at_e012_access, .writefn = ats_write64 }, }; static const ARMCPRegInfo ats1cp_reginfo[] = { -- 2.41.0
[PATCH v2 0/6] target/arm: Fixes for RME
A few patches to fix RME support and allow booting a realm guest, based on https://lore.kernel.org/qemu-devel/20230714154648.327466-1-peter.mayd...@linaro.org/ Since v1 I fixed patches 1, 2 and 6 following Peter's comments, and added patch 5. Patch 6 now factors the timer IRQ update into a new function, which is a bit invasive but seems cleaner. v1: https://lore.kernel.org/qemu-devel/20230719153018.1456180-2-jean-phili...@linaro.org/ Jean-Philippe Brucker (6): target/arm/ptw: Load stage-2 tables from realm physical space target/arm/helper: Fix tlbmask and tlbbits for TLBI VAE2* target/arm: Skip granule protection checks for AT instructions target/arm: Pass security space rather than flag for AT instructions target/arm/helper: Check SCR_EL3.{NSE,NS} encoding for AT instructions target/arm/helper: Implement CNTHCTL_EL2.CNT[VP]MASK target/arm/cpu.h| 3 + target/arm/internals.h | 25 +++--- target/arm/helper.c | 171 +--- target/arm/ptw.c| 39 + target/arm/trace-events | 7 +- 5 files changed, 170 insertions(+), 75 deletions(-) -- 2.41.0
Re: [PATCH v2] virtio-iommu: Use qemu_real_host_page_mask as default page_size_mask
On Wed, Feb 21, 2024 at 11:41:57AM +0100, Eric Auger wrote: > Hi, > > On 2/13/24 13:00, Michael S. Tsirkin wrote: > > On Tue, Feb 13, 2024 at 12:24:22PM +0100, Eric Auger wrote: > >> Hi Michael, > >> On 2/13/24 12:09, Michael S. Tsirkin wrote: > >>> On Tue, Feb 13, 2024 at 11:32:13AM +0100, Eric Auger wrote: > Do you have an other concern? > >>> I also worry a bit about migrating between hosts with different > >>> page sizes. Not with kvm I am guessing but with tcg it does work I think? > >> I have never tried but is it a valid use case? Adding Peter in CC. > >>> Is this just for vfio and vdpa? Can we limit this to these setups > >>> maybe? > >> I am afraid we know the actual use case too later. If the VFIO device is > >> hotplugged we have started working with 4kB granule. > >> > >> The other way is to introduce a min_granule option as done for aw-bits. > >> But it is heavier. > >> > >> Thanks > >> > >> Eric > > Let's say, if you are changing the default then we definitely want > > a way to get the cmpatible behaviour for tcg. > > So the compat machinery should be user-accessible too and documented. > > I guess I need to add a new option to guarantee the machine compat. > > I was thinking about an enum GranuleMode property taking the following > values, 4KB, 64KB, host > Jean, do you think there is a rationale offering something richer? 16KB seems to be gaining popularity, we should include that (I think it's the only granule supported by Apple IOMMU?). Hopefully that will be enough. Thanks, Jean > > Obviously being able to set the exact page_size_mask + host mode would > be better but this does not really fit into any std property type. > > Thanks > > Eric > > >
Re: [PATCH 0/3] VIRTIO-IOMMU: Introduce an aw-bits option
Hi Eric, On Tue, Jan 23, 2024 at 07:15:54PM +0100, Eric Auger wrote: > In [1] and [2] we attempted to fix a case where a VFIO-PCI device > protected with a virtio-iommu is assigned to an x86 guest. On x86 > the physical IOMMU may have an address width (gaw) of 39 or 48 bits > whereas the virtio-iommu exposes a 64b input address space by default. > Hence the guest may try to use the full 64b space and DMA MAP > failures may be encountered. To work around this issue we endeavoured > to pass usable host IOVA regions (excluding the out of range space) from > VFIO to the virtio-iommu device so that the virtio-iommu driver can > query those latter during the probe request and let the guest iommu > kernel subsystem carve them out. > > However if there are several devices in the same iommu group, > only the reserved regions of the first one are taken into > account by the iommu subsystem of the guest. This generally > works on baremetal because devices are not going to > expose different reserved regions. However in our case, this > may prevent from taking into account the host iommu geometry. > > So the simplest solution to this problem looks to introduce an > input address width option, aw-bits, which matches what is > done on the intel-iommu. By default, from now on it is set > to 39 bits with pc_q35 and 64b with arm virt. Doesn't Arm have the same problem? The TTB0 page tables limit what can be mapped to 48-bit, or 52-bit when SMMU_IDR5.VAX==1 and granule is 64kB. A Linux host driver could configure smaller VA sizes: * SMMUv2 limits the VA to SMMU_IDR2.UBS (upstream bus size) which can go as low as 32-bit (I'm assuming we don't care about 32-bit hosts). * SMMUv3 currently limits the VA to CONFIG_ARM64_VA_BITS, which could be as low as 36 bits (but realistically 39, since 36 depends on 16kB pages and CONFIG_EXPERT). But 64-bit definitely can't work for VFIO, and I suppose isn't useful for virtual devices, so maybe 39 is also a reasonable default on Arm. 
Thanks, Jean > This replaces the > previous default value of 64b. So we need to introduce a compat > for pc_q35 machines older than 9.0 to behave similarly.
Re: [PATCH 0/3] VIRTIO-IOMMU: Introduce an aw-bits option
On Mon, Jan 29, 2024 at 03:07:41PM +0100, Eric Auger wrote: > Hi Jean-Philippe, > > On 1/29/24 13:23, Jean-Philippe Brucker wrote: > > Hi Eric, > > > > On Tue, Jan 23, 2024 at 07:15:54PM +0100, Eric Auger wrote: > >> In [1] and [2] we attempted to fix a case where a VFIO-PCI device > >> protected with a virtio-iommu is assigned to an x86 guest. On x86 > >> the physical IOMMU may have an address width (gaw) of 39 or 48 bits > >> whereas the virtio-iommu exposes a 64b input address space by default. > >> Hence the guest may try to use the full 64b space and DMA MAP > >> failures may be encountered. To work around this issue we endeavoured > >> to pass usable host IOVA regions (excluding the out of range space) from > >> VFIO to the virtio-iommu device so that the virtio-iommu driver can > >> query those latter during the probe request and let the guest iommu > >> kernel subsystem carve them out. > >> > >> However if there are several devices in the same iommu group, > >> only the reserved regions of the first one are taken into > >> account by the iommu subsystem of the guest. This generally > >> works on baremetal because devices are not going to > >> expose different reserved regions. However in our case, this > >> may prevent from taking into account the host iommu geometry. > >> > >> So the simplest solution to this problem looks to introduce an > >> input address width option, aw-bits, which matches what is > >> done on the intel-iommu. By default, from now on it is set > >> to 39 bits with pc_q35 and 64b with arm virt. > > Doesn't Arm have the same problem? The TTB0 page tables limit what can be > > mapped to 48-bit, or 52-bit when SMMU_IDR5.VAX==1 and granule is 64kB. > > A Linux host driver could configure smaller VA sizes: > > * SMMUv2 limits the VA to SMMU_IDR2.UBS (upstream bus size) which > > can go as low as 32-bit (I'm assuming we don't care about 32-bit hosts). > Yes I think we can ignore that use case. 
> > * SMMUv3 currently limits the VA to CONFIG_ARM64_VA_BITS, which > > could be as low as 36 bits (but realistically 39, since 36 depends on > > 16kB pages and CONFIG_EXPERT). > Further reading "3.4.1 Input address size and Virtual Address size" ooks > indeed SMMU_IDR5.VAX gives info on the physical SMMU actual > implementation max (which matches intel iommu gaw). I missed that. Now I > am confused about should we limit VAS to 39 to accomodate of the worst > case host SW configuration or shall we use 48 instead? I don't know what's best either. 48 should be fine if hosts normally enable VA_BITS_48 (I see debian has it [1], not sure how to find the others). [1] https://salsa.debian.org/kernel-team/linux/-/blob/master/debian/config/arm64/config?ref_type=heads#L18 > If we set such a low 39b value, won't it prevent some guests from > properly working? It's not that low, since it gives each endpoint a private 512GB address space, but yes there might be special cases that reach the limit. Maybe assign a multi-queue NIC to a 256-vCPU guest, and if you want per-vCPU DMA pools, then with a 39-bit address space you only get 2GB per vCPU. With 48-bit you get 1TB which should be plenty. 52-bit private IOVA space doesn't seem useful, I doubt we'll ever need to support that on the MAP/UNMAP interface. So I guess 48-bit can be the default, and users with special setups can override aw-bits. Thanks, Jean
Re: [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for hotplugged devices
On Mon, Jan 29, 2024 at 05:38:55PM +0100, Eric Auger wrote: > > There may be a separate argument for clearing bypass. With a coldplugged > > VFIO device the flow is: > > > > 1. Map the whole guest address space in VFIO to implement boot-bypass. > >This allocates all guest pages, which takes a while and is wasteful. > >I've actually crashed a host that way, when spawning a guest with too > >much RAM. > interesting > > 2. Start the VM > > 3. When the virtio-iommu driver attaches a (non-identity) domain to the > >assigned endpoint, then unmap the whole address space in VFIO, and most > >pages are given back to the host. > > > > We can't disable boot-bypass because the BIOS needs it. But instead the > > flow could be: > > > > 1. Start the VM, with only the virtual endpoints. Nothing to pin. > > 2. The virtio-iommu driver disables bypass during boot > We needed this boot-bypass mode for booting with virtio-blk-scsi > protected with virtio-iommu for instance. > That was needed because we don't have any virtio-iommu driver in edk2 as > opposed to intel iommu driver, right? Yes. What I had in mind is the x86 SeaBIOS which doesn't have any IOMMU driver and accesses the default SATA device: $ qemu-system-x86_64 -M q35 -device virtio-iommu,boot-bypass=off qemu: virtio_iommu_translate sid=250 is not known!! qemu: no buffer available in event queue to report event qemu: AHCI: Failed to start FIS receive engine: bad FIS receive buffer address But it's the same problem with edk2. Also a guest OS without a virtio-iommu driver needs boot-bypass. Once firmware boot is complete, the OS with a virtio-iommu driver normally can turn bypass off in the config space, it's not useful anymore. If it needs to put some endpoints in bypass, then it can attach them to a bypass domain. > > 3. Hotplug the VFIO device. With bypass disabled there is no need to pin > >the whole guest address space, unless the guest explicitly asks for an > >identity domain. 
> > > > However, I don't know if this is a realistic scenario that will actually > > be used. > > > > By the way, do you have an easy way to reproduce the issue described here? > > I've had to enable iommu.forcedac=1 on the command-line, otherwise Linux > > just allocates 32-bit IOVAs. > I don't have a simple generic reproducer. It happens when assigning this > device: > Ethernet Controller E810-C for QSFP (Ethernet Network Adapter E810-C-Q2) > > I have not encountered that issue with another device yet. > I see on guest side in dmesg: > [ 6.849292] ice :00:05.0: Using 64-bit DMA addresses > > That's emitted in dma-iommu.c iommu_dma_alloc_iova(). > Looks like the guest first tries to allocate an iova in the 32-bit AS > and if this fails use the whole dma_limit. > Seems the 32b IOVA alloc failed here ;-) Interesting, are you running some demanding workload and a lot of CPUs? That's a lot of IOVAs used up, I'm curious about what kind of DMA pattern does that. Thanks, Jean
Re: [PATCH v2 1/3] virtio-iommu: Add an option to define the input range width
Hi Eric, On Thu, Feb 01, 2024 at 05:32:22PM +0100, Eric Auger wrote: > aw-bits is a new option that allows to set the bit width of > the input address range. This value will be used as a default for > the device config input_range.end. By default it is set to 64 bits > which is the current value. > > Signed-off-by: Eric Auger > > --- > > v1 -> v2: > - Check the aw-bits value is within [32,64] > --- > include/hw/virtio/virtio-iommu.h | 1 + > hw/virtio/virtio-iommu.c | 7 ++- > 2 files changed, 7 insertions(+), 1 deletion(-) > > diff --git a/include/hw/virtio/virtio-iommu.h > b/include/hw/virtio/virtio-iommu.h > index 781ebaea8f..5fbe4677c2 100644 > --- a/include/hw/virtio/virtio-iommu.h > +++ b/include/hw/virtio/virtio-iommu.h > @@ -66,6 +66,7 @@ struct VirtIOIOMMU { > bool boot_bypass; > Notifier machine_done; > bool granule_frozen; > +uint8_t aw_bits; > }; > > #endif > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index ec2ba11d1d..7870bdbeee 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -1314,7 +1314,11 @@ static void virtio_iommu_device_realize(DeviceState > *dev, Error **errp) > */ > s->config.bypass = s->boot_bypass; > s->config.page_size_mask = qemu_real_host_page_mask(); > -s->config.input_range.end = UINT64_MAX; > +if (s->aw_bits < 32 || s->aw_bits > 64) { I'm wondering if we should lower this to 16 bits, just to support all possible host SMMU configurations (the smallest address space configurable with T0SZ is 25-bit, or 16-bit with the STT extension). Thanks, Jean > +error_setg(errp, "aw-bits must be within [32,64]"); > +} > +s->config.input_range.end = > +s->aw_bits == 64 ? 
UINT64_MAX : BIT_ULL(s->aw_bits) - 1; > s->config.domain_range.end = UINT32_MAX; > s->config.probe_size = VIOMMU_PROBE_SIZE; > > @@ -1525,6 +1529,7 @@ static Property virtio_iommu_properties[] = { > DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, > TYPE_PCI_BUS, PCIBus *), > DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), > +DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64), > DEFINE_PROP_END_OF_LIST(), > }; > > -- > 2.41.0 >
Re: [PATCH v2 1/3] virtio-iommu: Add an option to define the input range width
On Thu, Feb 08, 2024 at 09:16:35AM +0100, Eric Auger wrote: > >> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > >> index ec2ba11d1d..7870bdbeee 100644 > >> --- a/hw/virtio/virtio-iommu.c > >> +++ b/hw/virtio/virtio-iommu.c > >> @@ -1314,7 +1314,11 @@ static void virtio_iommu_device_realize(DeviceState > >> *dev, Error **errp) > >> */ > >> s->config.bypass = s->boot_bypass; > >> s->config.page_size_mask = qemu_real_host_page_mask(); > >> -s->config.input_range.end = UINT64_MAX; > >> +if (s->aw_bits < 32 || s->aw_bits > 64) { > > I'm wondering if we should lower this to 16 bits, just to support all > > possible host SMMU configurations (the smallest address space configurable > > with T0SZ is 25-bit, or 16-bit with the STT extension). > Is it a valid use case to assign host devices protected by > virtio-iommu with a physical SMMU featuring Small Translation Table? Probably not, I'm guessing STT is for tiny embedded implementations where QEMU or even virtualization is not a use case. But because smaller mobile platforms now implement SMMUv3, using smaller IOVA spaces and thus fewer page tables can be beneficial. One use case I have in mind is android with pKVM, each app has its own VM, and devices can be partitioned into lots of address spaces with PASID, so you can save a lot of memory and table-walk time by shrinking those address spaces. But that particular case will use crosvm so isn't relevant here, it's only an example. Mainly I was concerned that if the Linux driver decides to allow configuring smaller address spaces (maybe a linux cmdline option), then using an architectural limit here would be a safe bet that things can still work. But we can always change it in a later version, or implement finer controls (ideally the guest driver would configure the VA size in ATTACH). > It leaves 64kB IOVA space only. Besides in the spec, it is written the > min T0SZ can even be 12. 
> > "The minimum valid value is 16 unless all of the following also hold, in > which case the minimum permitted > value is 12: > – SMMUv3.1 or later is supported. > – SMMU_IDR5.VAX indicates support for 52-bit Vas. > – The corresponding CD.TGx selects a 64KB granule. > " Yes that's confusing because va_size = 64 - T0SZ, so T0SZ=12 actually describes the largest address size, 52. > > At the moment I would prefer to stick to the limit suggested by Alex > which looks also sensible for other archs whereas 16 doesn't. Agreed, it should be sufficient. Thanks, Jean
Re: [PATCH v3 0/3] VIRTIO-IOMMU: Introduce an aw-bits option
On Thu, Feb 08, 2024 at 11:10:16AM +0100, Eric Auger wrote: > In [1] and [2] we attempted to fix a case where a VFIO-PCI device > protected with a virtio-iommu is assigned to an x86 guest. On x86 > the physical IOMMU may have an address width (gaw) of 39 or 48 bits > whereas the virtio-iommu exposes a 64b input address space by default. > Hence the guest may try to use the full 64b space and DMA MAP > failures may be encountered. To work around this issue we endeavoured > to pass usable host IOVA regions (excluding the out of range space) from > VFIO to the virtio-iommu device so that the virtio-iommu driver can > query those latter during the probe request and let the guest iommu > kernel subsystem carve them out. > > However if there are several devices in the same iommu group, > only the reserved regions of the first one are taken into > account by the iommu subsystem of the guest. This generally > works on baremetal because devices are not going to > expose different reserved regions. However in our case, this > may prevent from taking into account the host iommu geometry. > > So the simplest solution to this problem looks to introduce an > input address width option, aw-bits, which matches what is > done on the intel-iommu. By default, from now on it is set > to 39 bits with pc_q35 and 48 with arm virt. This replaces the > previous default value of 64b. So we need to introduce a compat > for machines older than 9.0 to behave similarly. We use > hw_compat_8_2 to achieve that goal. For the series: Reviewed-by: Jean-Philippe Brucker > > Outstanding series [2] remains useful to let resv regions being > communicated on time before the probe request. 
> > [1] [PATCH v4 00/12] VIRTIO-IOMMU/VFIO: Don't assume 64b IOVA space > https://lore.kernel.org/all/20231019134651.842175-1-eric.au...@redhat.com/ > - This is merged - > > [2] [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for > hotplugged devices > https://lore.kernel.org/all/20240117080414.316890-1-eric.au...@redhat.com/ > - This is pending for review on the ML - > > This series can be found at: > https://github.com/eauger/qemu/tree/virtio-iommu-aw-bits-v3 > previous > https://github.com/eauger/qemu/tree/virtio-iommu-aw-bits-v2 > > Applied on top of [3] > [PATCH v2] virtio-iommu: Use qemu_real_host_page_mask as default > page_size_mask > https://lore.kernel.org/all/20240117132039.332273-1-eric.au...@redhat.com/ > > History: > v2 -> v3: > - Collected Zhenzhong and Cédric's R-b + Yanghang's T-b > - use &error_abort instead of NULL error handle > on object_property_get_uint() call (Cédric) > - use VTD_HOST_AW_39BIT (Cédric) > > v1 -> v2 > - Limit aw to 48b on ARM > - Check aw is within [32,64] > - Use hw_compat_8_2 > > > Eric Auger (3): > virtio-iommu: Add an option to define the input range width > virtio-iommu: Trace domain range limits as unsigned int > hw: Set virtio-iommu aw-bits default value on pc_q35 and arm virt > > include/hw/virtio/virtio-iommu.h | 1 + > hw/arm/virt.c| 6 ++ > hw/core/machine.c| 5 - > hw/i386/pc.c | 6 ++ > hw/virtio/virtio-iommu.c | 7 ++- > hw/virtio/trace-events | 2 +- > 6 files changed, 24 insertions(+), 3 deletions(-) > > -- > 2.41.0 >
[PATCH v2 08/22] target/arm/kvm: Split kvm_arch_get/put_registers
The confidential guest support in KVM limits the number of registers that we can read and write. Split the get/put_registers function to prepare for it. Signed-off-by: Jean-Philippe Brucker --- target/arm/kvm.c | 30 -- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index b00077c1a5..3504276822 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -2056,7 +2056,7 @@ static int kvm_arch_put_sve(CPUState *cs) return 0; } -int kvm_arch_put_registers(CPUState *cs, int level) +static int kvm_arm_put_core_regs(CPUState *cs, int level) { uint64_t val; uint32_t fpr; @@ -2159,6 +2159,19 @@ int kvm_arch_put_registers(CPUState *cs, int level) return ret; } +return 0; +} + +int kvm_arch_put_registers(CPUState *cs, int level) +{ +int ret; +ARMCPU *cpu = ARM_CPU(cs); + +ret = kvm_arm_put_core_regs(cs, level); +if (ret) { +return ret; +} + write_cpustate_to_list(cpu, true); if (!write_list_to_kvmstate(cpu, level)) { @@ -2240,7 +2253,7 @@ static int kvm_arch_get_sve(CPUState *cs) return 0; } -int kvm_arch_get_registers(CPUState *cs) +static int kvm_arm_get_core_regs(CPUState *cs) { uint64_t val; unsigned int el; @@ -2343,6 +2356,19 @@ int kvm_arch_get_registers(CPUState *cs) } vfp_set_fpcr(env, fpr); +return 0; +} + +int kvm_arch_get_registers(CPUState *cs) +{ +int ret; +ARMCPU *cpu = ARM_CPU(cs); + +ret = kvm_arm_get_core_regs(cs); +if (ret) { +return ret; +} + ret = kvm_get_vcpu_events(cpu); if (ret) { return ret; -- 2.44.0
[PATCH v2 02/22] target/arm: Add confidential guest support
Add a new RmeGuest object, inheriting from ConfidentialGuestSupport, to support the Arm Realm Management Extension (RME). It is instantiated by passing on the command-line: -M virt,confidential-guest-support= -object rme-guest,id=[,options...] This is only the skeleton. Support will be added in following patches. Cc: Eric Blake Cc: Markus Armbruster Cc: Daniel P. Berrangé Cc: Eduardo Habkost Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Jean-Philippe Brucker --- docs/system/confidential-guest-support.rst | 1 + qapi/qom.json | 3 +- target/arm/kvm-rme.c | 46 ++ target/arm/meson.build | 7 +++- 4 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 target/arm/kvm-rme.c diff --git a/docs/system/confidential-guest-support.rst b/docs/system/confidential-guest-support.rst index 0c490dbda2..acf46d8856 100644 --- a/docs/system/confidential-guest-support.rst +++ b/docs/system/confidential-guest-support.rst @@ -40,5 +40,6 @@ Currently supported confidential guest mechanisms are: * AMD Secure Encrypted Virtualization (SEV) (see :doc:`i386/amd-memory-encryption`) * POWER Protected Execution Facility (PEF) (see :ref:`power-papr-protected-execution-facility-pef`) * s390x Protected Virtualization (PV) (see :doc:`s390x/protvirt`) +* Arm Realm Management Extension (RME) Other mechanisms may be supported in future. 
diff --git a/qapi/qom.json b/qapi/qom.json index 85e6b4f84a..623ec8071f 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -996,7 +996,8 @@ 'tls-creds-x509', 'tls-cipher-suites', { 'name': 'x-remote-object', 'features': [ 'unstable' ] }, -{ 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] } +{ 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] }, +'rme-guest' ] } ## diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c new file mode 100644 index 00..960dd75608 --- /dev/null +++ b/target/arm/kvm-rme.c @@ -0,0 +1,46 @@ +/* + * QEMU Arm RME support + * + * Copyright Linaro 2024 + */ + +#include "qemu/osdep.h" + +#include "exec/confidential-guest-support.h" +#include "hw/boards.h" +#include "hw/core/cpu.h" +#include "kvm_arm.h" +#include "migration/blocker.h" +#include "qapi/error.h" +#include "qom/object_interfaces.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" + +#define TYPE_RME_GUEST "rme-guest" +OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) + +struct RmeGuest { +ConfidentialGuestSupport parent_obj; +}; + +static void rme_guest_class_init(ObjectClass *oc, void *data) +{ +} + +static const TypeInfo rme_guest_info = { +.parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, +.name = TYPE_RME_GUEST, +.instance_size = sizeof(struct RmeGuest), +.class_init = rme_guest_class_init, +.interfaces = (InterfaceInfo[]) { +{ TYPE_USER_CREATABLE }, +{ } +} +}; + +static void rme_register_types(void) +{ +type_register_static(&rme_guest_info); +} + +type_init(rme_register_types); diff --git a/target/arm/meson.build b/target/arm/meson.build index 2e10464dbb..c610c078f7 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -8,7 +8,12 @@ arm_ss.add(files( )) arm_ss.add(zlib) -arm_ss.add(when: 'CONFIG_KVM', if_true: files('hyp_gdbstub.c', 'kvm.c'), if_false: files('kvm-stub.c')) +arm_ss.add(when: 'CONFIG_KVM', + if_true: files( +'hyp_gdbstub.c', +'kvm.c', +'kvm-rme.c'), + if_false: files('kvm-stub.c')) arm_ss.add(when: 'CONFIG_HVF', if_true: 
files('hyp_gdbstub.c')) arm_ss.add(when: 'TARGET_AARCH64', if_true: files( -- 2.44.0
[PATCH v2 07/22] hw/arm/virt: Reserve one bit of guest-physical address for RME
When RME is enabled, the upper GPA bit is used to distinguish protected from unprotected addresses. Reserve it when setting up the guest memory map. Signed-off-by: Jean-Philippe Brucker --- v1->v2: separate patch --- hw/arm/virt.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index f300f100b5..eca9a96b5a 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2939,14 +2939,24 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) VirtMachineState *vms = VIRT_MACHINE(ms); int rme_vm_type = kvm_arm_rme_vm_type(ms); int max_vm_pa_size, requested_pa_size; +int rme_reserve_bit = 0; bool fixed_ipa; -max_vm_pa_size = kvm_arm_get_max_vm_ipa_size(ms, &fixed_ipa); +if (rme_vm_type) { +/* + * With RME, the upper GPA bit differentiates Realm from NS memory. + * Reserve the upper bit to ensure that highmem devices will fit. + */ +rme_reserve_bit = 1; +} + +max_vm_pa_size = kvm_arm_get_max_vm_ipa_size(ms, &fixed_ipa) - + rme_reserve_bit; /* we freeze the memory map to compute the highest gpa */ virt_set_memmap(vms, max_vm_pa_size); -requested_pa_size = 64 - clz64(vms->highest_gpa); +requested_pa_size = 64 - clz64(vms->highest_gpa) + rme_reserve_bit; /* * KVM requires the IPA size to be at least 32 bits. -- 2.44.0
[PATCH v2 10/22] target/arm/kvm: Create scratch VM as Realm if necessary
Some ID registers have a different value for a Realm VM, for example ID_AA64DFR0_EL1 contains the number of breakpoints/watchpoints implemented by RMM instead of the hardware. Even though RMM is in charge of setting up most Realm registers, KVM still provides GET_ONE_REG interface on a Realm VM to probe the VM's capabilities. KVM only reports the maximum IPA it supports, but RMM may support smaller sizes. If the VM creation fails with the value returned by KVM, then retry with the smaller working address. This needs a better solution. Signed-off-by: Jean-Philippe Brucker --- target/arm/kvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 3a2233ec73..6d368bf442 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -104,6 +104,7 @@ bool kvm_arm_create_scratch_host_vcpu(const uint32_t *cpus_to_try, { int ret = 0, kvmfd = -1, vmfd = -1, cpufd = -1; int max_vm_pa_size; +int vm_type; kvmfd = qemu_open_old("/dev/kvm", O_RDWR); if (kvmfd < 0) { @@ -113,8 +114,9 @@ bool kvm_arm_create_scratch_host_vcpu(const uint32_t *cpus_to_try, if (max_vm_pa_size < 0) { max_vm_pa_size = 0; } +vm_type = kvm_arm_rme_vm_type(MACHINE(qdev_get_machine())); do { -vmfd = ioctl(kvmfd, KVM_CREATE_VM, max_vm_pa_size); +vmfd = ioctl(kvmfd, KVM_CREATE_VM, max_vm_pa_size | vm_type); } while (vmfd == -1 && errno == EINTR); if (vmfd < 0) { goto err; -- 2.44.0
[PATCH v2 05/22] hw/arm/virt: Add support for Arm RME
When confidential-guest-support is enabled for the virt machine, call the RME init function, and add the RME flag to the VM type. Signed-off-by: Jean-Philippe Brucker --- v1->v2: * Don't explicitly disable steal_time, it's now done through KVM capabilities * Split patch --- hw/arm/virt.c | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index a9a913aead..07ad31876e 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -224,6 +224,11 @@ static const int a15irqmap[] = { [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ }; +static bool virt_machine_is_confidential(VirtMachineState *vms) +{ +return MACHINE(vms)->cgs; +} + static void create_randomness(MachineState *ms, const char *node) { struct { @@ -2111,10 +2116,11 @@ static void machvirt_init(MachineState *machine) * if the guest has EL2 then we will use SMC as the conduit, * and otherwise we will use HVC (for backwards compatibility and * because if we're using KVM then we must use HVC). + * Realm guests must also use SMC. */ if (vms->secure && firmware_loaded) { vms->psci_conduit = QEMU_PSCI_CONDUIT_DISABLED; -} else if (vms->virt) { +} else if (vms->virt || virt_machine_is_confidential(vms)) { vms->psci_conduit = QEMU_PSCI_CONDUIT_SMC; } else { vms->psci_conduit = QEMU_PSCI_CONDUIT_HVC; @@ -2917,6 +2923,7 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, static int virt_kvm_type(MachineState *ms, const char *type_str) { VirtMachineState *vms = VIRT_MACHINE(ms); +int rme_vm_type = kvm_arm_rme_vm_type(ms); int max_vm_pa_size, requested_pa_size; bool fixed_ipa; @@ -2946,7 +2953,11 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) * the implicit legacy 40b IPA setting, in which case the kvm_type * must be 0. */ -return fixed_ipa ? 0 : requested_pa_size; +if (fixed_ipa) { +return 0; +} + +return requested_pa_size | rme_vm_type; } static void virt_machine_class_init(ObjectClass *oc, void *data) -- 2.44.0
[PATCH v2 14/22] target/arm/kvm-rme: Add Realm Personalization Value parameter
The Realm Personalization Value (RPV) is provided by the user to distinguish Realms that have the same initial measurement. The user provides up to 64 hexadecimal bytes. They are stored into the RPV in the same order, zero-padded on the right. Cc: Eric Blake Cc: Markus Armbruster Cc: Daniel P. Berrangé Cc: Eduardo Habkost Signed-off-by: Jean-Philippe Brucker --- v1->v2: Move parsing early, store as-is rather than reverted --- qapi/qom.json| 15 +- target/arm/kvm-rme.c | 111 +++ 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/qapi/qom.json b/qapi/qom.json index 623ec8071f..91654aa267 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -931,6 +931,18 @@ 'data': { '*cpu-affinity': ['uint16'], '*node-affinity': ['uint16'] } } +## +# @RmeGuestProperties: +# +# Properties for rme-guest objects. +# +# @personalization-value: Realm personalization value, as a 64-byte hex string +# (default: 0) +# +# Since: FIXME +## +{ 'struct': 'RmeGuestProperties', + 'data': { '*personalization-value': 'str' } } ## # @ObjectType: @@ -1066,7 +1078,8 @@ 'tls-creds-x509': 'TlsCredsX509Properties', 'tls-cipher-suites': 'TlsCredsProperties', 'x-remote-object':'RemoteObjectProperties', - 'x-vfio-user-server': 'VfioUserServerProperties' + 'x-vfio-user-server': 'VfioUserServerProperties', + 'rme-guest': 'RmeGuestProperties' } } ## diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index b2ad10ef6d..cb5c3f7a22 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -23,10 +23,13 @@ OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) #define RME_PAGE_SIZE qemu_real_host_page_size() +#define RME_MAX_CFG 1 + struct RmeGuest { ConfidentialGuestSupport parent_obj; Notifier rom_load_notifier; GSList *ram_regions; +uint8_t *personalization_value; }; typedef struct { @@ -54,6 +57,48 @@ static int rme_create_rd(Error **errp) return ret; } +static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) +{ +int ret; +const char *cfg_str; +struct kvm_cap_arm_rme_config_item 
args = { +.cfg = cfg, +}; + +switch (cfg) { +case KVM_CAP_ARM_RME_CFG_RPV: +if (!guest->personalization_value) { +return 0; +} +memcpy(args.rpv, guest->personalization_value, KVM_CAP_ARM_RME_RPV_SIZE); +cfg_str = "personalization value"; +break; +default: +g_assert_not_reached(); +} + +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_CONFIG_REALM, (intptr_t)&args); +if (ret) { +error_setg_errno(errp, -ret, "RME: failed to configure %s", cfg_str); +} +return ret; +} + +static int rme_configure(void) +{ +int ret; +int cfg; + +for (cfg = 0; cfg < RME_MAX_CFG; cfg++) { +ret = rme_configure_one(rme_guest, cfg, &error_abort); +if (ret) { +return ret; +} +} +return 0; +} + static void rme_populate_realm(gpointer data, gpointer unused) { int ret; @@ -98,6 +143,11 @@ static void rme_vm_state_change(void *opaque, bool running, RunState state) return; } +ret = rme_configure(); +if (ret) { +return; +} + ret = rme_create_rd(&error_abort); if (ret) { return; @@ -231,8 +281,69 @@ int kvm_arm_rme_vm_type(MachineState *ms) return 0; } +static char *rme_get_rpv(Object *obj, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); +GString *s; +int i; + +if (!guest->personalization_value) { +return NULL; +} + +s = g_string_sized_new(KVM_CAP_ARM_RME_RPV_SIZE * 2 + 1); + +for (i = 0; i < KVM_CAP_ARM_RME_RPV_SIZE; i++) { +g_string_append_printf(s, "%02x", guest->personalization_value[i]); +} + +return g_string_free(s, /* free_segment */ false); +} + +static void rme_set_rpv(Object *obj, const char *value, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); +size_t len = strlen(value); +uint8_t *out; +int i = 1; +int ret; + +g_free(guest->personalization_value); +guest->personalization_value = out = g_malloc0(KVM_CAP_ARM_RME_RPV_SIZE); + +/* Two chars per byte */ +if (len > KVM_CAP_ARM_RME_RPV_SIZE * 2) { +error_setg(errp, "Realm Personalization Value is too large"); +return; +} + +/* First byte may have a single char */ +if (len % 2) { +ret = sscanf(value, "%1hhx", 
out++); +} else { +ret = ssc
[PATCH v2 01/22] kvm: Merge kvm_check_extension() and kvm_vm_check_extension()
The KVM_CHECK_EXTENSION ioctl can be issued either on the global fd (/dev/kvm), or on the VM fd obtained with KVM_CREATE_VM. For most extensions, KVM returns the same value with either method, but for some of them it can refine the returned value depending on the VM type. The KVM documentation [1] advises to use the VM fd: Based on their initialization different VMs may have different capabilities. It is thus encouraged to use the vm ioctl to query for capabilities (available with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) Ongoing work on Arm confidential VMs confirms this, as some capabilities become unavailable to confidential VMs, requiring changes in QEMU to use kvm_vm_check_extension() instead of kvm_check_extension() [2]. Rather than changing each check one by one, change kvm_check_extension() to always issue the ioctl on the VM fd when available, and remove kvm_vm_check_extension(). Fall back to the global fd when the VM check is unavailable: * Ancient kernels do not support KVM_CHECK_EXTENSION on the VM fd, since it was added by commit 92b591a4c46b ("KVM: Allow KVM_CHECK_EXTENSION on the vm fd") in Linux 3.17 [3]. Support for Linux 3.16 ended in June 2020, but there may still be old images around. * A couple of calls must be issued before the VM fd is available, since they determine the VM type: KVM_CAP_MIPS_VZ and KVM_CAP_ARM_VM_IPA_SIZE Does any user actually depend on the check being done on the global fd instead of the VM fd? I surveyed all cases where KVM presently returns different values depending on the query method. Luckily QEMU already calls kvm_vm_check_extension() for most of those. Only three of them are ambiguous, because currently done on the global fd: * KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID on Arm, changes value if the user requests a vGIC different from the default. But QEMU queries this before vGIC configuration, so the reported value will be the same. * KVM_CAP_SW_TLB on PPC. 
When issued on the global fd, returns false if the kvm-hv module is loaded; when issued on the VM fd, returns false only if the VM type is HV instead of PR. If this returns false, then QEMU will fail to initialize a BOOKE206 MMU model. So this patch supposedly improves things, as it allows to run this type of vCPU even when both KVM modules are loaded. * KVM_CAP_PPC_SECURE_GUEST. Similarly, doing this check on a VM fd refines the returned value, and ensures that SVM is actually supported. Since QEMU follows the check with kvm_vm_enable_cap(), this patch should only provide better error reporting. [1] https://www.kernel.org/doc/html/latest/virt/kvm/api.html#kvm-check-extension [2] https://lore.kernel.org/kvm/875ybi0ytc@redhat.com/ [3] https://github.com/torvalds/linux/commit/92b591a4c46b Cc: Marcelo Tosatti Cc: Nicholas Piggin Cc: Daniel Henrique Barboza Cc: qemu-...@nongnu.org Suggested-by: Cornelia Huck Signed-off-by: Jean-Philippe Brucker --- v1: https://lore.kernel.org/qemu-devel/20230421163822.839167-1-jean-phili...@linaro.org/ v1->v2: Initialize check_extension_vm using kvm_vm_ioctl() as suggested --- include/sysemu/kvm.h | 2 -- include/sysemu/kvm_int.h | 1 + accel/kvm/kvm-all.c | 34 +++--- target/arm/kvm.c | 2 +- target/i386/kvm/kvm.c| 6 +++--- target/ppc/kvm.c | 36 ++-- 6 files changed, 38 insertions(+), 43 deletions(-) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index c6f34d4794..df97077434 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -404,8 +404,6 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cpu); int kvm_check_extension(KVMState *s, unsigned int extension); -int kvm_vm_check_extension(KVMState *s, unsigned int extension); - #define kvm_vm_enable_cap(s, capability, cap_flags, ...) 
\ ({ \ struct kvm_enable_cap cap = {\ diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index cad763e240..fa4c9aeb96 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -123,6 +123,7 @@ struct KVMState uint16_t xen_gnttab_max_frames; uint16_t xen_evtchn_max_pirq; char *device; +bool check_extension_vm; }; void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e08dd04164..3d9fbc8a98 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1128,7 +1128,11 @@ int kvm_check_extension(KVMState *s, unsigned int extension) { int ret; -ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); +if (!s->check_extension_vm) { +ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); +} else { +ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); +} if (ret < 0) { ret = 0; } @@ -1136,19 +1140,6 @@ int
[PATCH v2 03/22] target/arm/kvm: Return immediately on error in kvm_arch_init()
Returning an error to kvm_init() is fatal anyway, no need to continue the initialization. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- target/arm/kvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 3371ffa401..a5673241e5 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -566,7 +566,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) !kvm_check_extension(s, KVM_CAP_ARM_IRQ_LINE_LAYOUT_2)) { error_report("Using more than 256 vcpus requires a host kernel " "with KVM_CAP_ARM_IRQ_LINE_LAYOUT_2"); -ret = -EINVAL; +return -EINVAL; } if (kvm_check_extension(s, KVM_CAP_ARM_NISV_TO_USER)) { @@ -595,6 +595,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) if (ret < 0) { error_report("Enabling of Eager Page Split failed: %s", strerror(-ret)); +return ret; } } } -- 2.44.0
[PATCH v2 00/22] arm: Run CCA VMs with KVM
These patches enable launching a confidential guest with QEMU KVM on Arm. The KVM changes for CCA have now been posted as v2 [1]. Launching a confidential VM requires two additional command-line parameters: -M confidential-guest-support=rme0 -object rme-guest,id=rme0 Since the RFC [2] I tried to address all review comments, and added a few features: * Enabled support for guest memfd by Xiaoyao Li and Chao Peng [3]. Guest memfd is mandatory for CCA. * Support firmware boot (edk2). * Use CPU command-line arguments for Realm parameters. SVE vector length uses the existing sve -cpu parameters, while breakpoints, watchpoints and PMU counters use new CPU parameters. The full series based on the memfd patches is at: https://git.codelinaro.org/linaro/dcap/qemu.git branch cca/v2 Please find instructions for building and running the whole CCA stack at: https://linaro.atlassian.net/wiki/spaces/QEMU/pages/29051027459/Building+an+RME+stack+for+QEMU [1] https://lore.kernel.org/kvm/20240412084056.1733704-1-steven.pr...@arm.com/ [2] https://lore.kernel.org/all/20230127150727.612594-1-jean-phili...@linaro.org/ [3] https://lore.kernel.org/qemu-devel/20240322181116.1228416-1-pbonz...@redhat.com/ Jean-Philippe Brucker (22): kvm: Merge kvm_check_extension() and kvm_vm_check_extension() target/arm: Add confidential guest support target/arm/kvm: Return immediately on error in kvm_arch_init() target/arm/kvm-rme: Initialize realm hw/arm/virt: Add support for Arm RME hw/arm/virt: Disable DTB randomness for confidential VMs hw/arm/virt: Reserve one bit of guest-physical address for RME target/arm/kvm: Split kvm_arch_get/put_registers target/arm/kvm-rme: Initialize vCPU target/arm/kvm: Create scratch VM as Realm if necessary hw/core/loader: Add ROM loader notifier target/arm/kvm-rme: Populate Realm memory hw/arm/boot: Register Linux BSS section for confidential guests target/arm/kvm-rme: Add Realm Personalization Value parameter target/arm/kvm-rme: Add measurement algorithm property 
target/arm/cpu: Set number of breakpoints and watchpoints in KVM target/arm/cpu: Set number of PMU counters in KVM target/arm/kvm: Disable Realm reboot target/arm/cpu: Inform about reading confidential CPU registers target/arm/kvm-rme: Enable guest memfd hw/arm/virt: Move virt_flash_create() to machvirt_init() hw/arm/virt: Use RAM instead of flash for confidential guest firmware docs/system/arm/virt.rst | 9 +- docs/system/confidential-guest-support.rst | 1 + qapi/qom.json | 34 +- include/hw/arm/boot.h | 9 + include/hw/arm/virt.h | 2 +- include/hw/loader.h| 15 + include/sysemu/kvm.h | 2 - include/sysemu/kvm_int.h | 1 + target/arm/cpu.h | 10 + target/arm/kvm_arm.h | 25 ++ accel/kvm/kvm-all.c| 34 +- hw/arm/boot.c | 45 ++- hw/arm/virt.c | 118 -- hw/core/loader.c | 15 + target/arm/arm-qmp-cmds.c | 1 + target/arm/cpu.c | 5 + target/arm/cpu64.c | 118 ++ target/arm/kvm-rme.c | 413 + target/arm/kvm.c | 200 +- target/i386/kvm/kvm.c | 6 +- target/ppc/kvm.c | 36 +- target/arm/meson.build | 7 +- 22 files changed, 1023 insertions(+), 83 deletions(-) create mode 100644 target/arm/kvm-rme.c -- 2.44.0
[PATCH v2 22/22] hw/arm/virt: Use RAM instead of flash for confidential guest firmware
The flash device that holds firmware code relies on read-only stage-2 mappings. Read accesses behave as RAM and write accesses as MMIO. Since the RMM does not support read-only mappings we cannot use the flash device as-is. That isn't a problem because the firmware does not want to disclose any information to the host, hence will not store its variables in clear persistent memory. We can therefore replace the flash device with RAM, and load the firmware there. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- include/hw/arm/boot.h | 9 + hw/arm/boot.c | 34 ++--- hw/arm/virt.c | 44 +++ 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h index 80c492d742..d91cfc6942 100644 --- a/include/hw/arm/boot.h +++ b/include/hw/arm/boot.h @@ -112,6 +112,10 @@ struct arm_boot_info { */ bool firmware_loaded; +/* Used when loading firmware into RAM */ +hwaddr firmware_base; +hwaddr firmware_max_size; + /* Address at which board specific loader/setup code exists. If enabled, * this code-blob will run before anything else. It must return to the * caller via the link register. There is no stack set up. Enabled by @@ -132,6 +136,11 @@ struct arm_boot_info { bool secure_board_setup; arm_endianness endianness; + +/* + * Confidential guest boot loads everything into RAM so it can be measured. 
+ */ +bool confidential; }; /** diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 9f522e332b..26c6334d52 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -1154,7 +1154,31 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, } } -static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) +static void arm_setup_confidential_firmware_boot(ARMCPU *cpu, + struct arm_boot_info *info, + const char *firmware_filename) +{ +ssize_t fw_size; +const char *fname; +AddressSpace *as = arm_boot_address_space(cpu, info); + +fname = qemu_find_file(QEMU_FILE_TYPE_BIOS, firmware_filename); +if (!fname) { +error_report("Could not find firmware image '%s'", firmware_filename); +exit(1); +} + +fw_size = load_image_targphys_as(firmware_filename, + info->firmware_base, + info->firmware_max_size, as); +if (fw_size <= 0) { +error_report("could not load firmware '%s'", firmware_filename); +exit(1); +} +} + +static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info, +const char *firmware_filename) { /* Set up for booting firmware (which might load a kernel via fw_cfg) */ @@ -1205,6 +1229,10 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) } } +if (info->confidential) { +arm_setup_confidential_firmware_boot(cpu, info, firmware_filename); +} + /* * We will start from address 0 (typically a boot ROM image) in the * same way as hardware. Leave env->boot_info NULL, so that @@ -1243,9 +1271,9 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) info->dtb_filename = ms->dtb; info->dtb_limit = 0; -/* Load the kernel. */ +/* Load the kernel and/or firmware. 
*/ if (!info->kernel_filename || info->firmware_loaded) { -arm_setup_firmware_boot(cpu, info); +arm_setup_firmware_boot(cpu, info, ms->firmware); } else { arm_setup_direct_kernel_boot(cpu, info); } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index bed19d0b79..4a6281fc89 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1178,6 +1178,10 @@ static PFlashCFI01 *virt_flash_create1(VirtMachineState *vms, static void virt_flash_create(VirtMachineState *vms) { +if (virt_machine_is_confidential(vms)) { +return; +} + vms->flash[0] = virt_flash_create1(vms, "virt.flash0", "pflash0"); vms->flash[1] = virt_flash_create1(vms, "virt.flash1", "pflash1"); } @@ -1213,6 +1217,10 @@ static void virt_flash_map(VirtMachineState *vms, hwaddr flashsize = vms->memmap[VIRT_FLASH].size / 2; hwaddr flashbase = vms->memmap[VIRT_FLASH].base; +if (virt_machine_is_confidential(vms)) { +return; +} + virt_flash_map1(vms->flash[0], flashbase, flashsize, secure_sysmem); virt_flash_map1(vms->flash[1], flashbase + flashsize, flashsize, @@ -1228,6 +1236,10 @@ static void virt_flash_fdt(VirtMachineState *vms, MachineState *ms = MACHINE(vms); char *nodename; +if (virt_machine_is_confiden
[PATCH v2 20/22] target/arm/kvm-rme: Enable guest memfd
Request that RAM block uses the KVM guest memfd call to allocate guest memory. With RME, guest memory is not accessible by the host, and using guest memfd ensures that the host kernel is aware of this and doesn't attempt to access guest pages. Done in a separate patch because ms->require_guest_memfd is not yet merged. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- target/arm/kvm-rme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 8f39e54aaa..71cc1d4147 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -263,6 +263,7 @@ int kvm_arm_rme_init(MachineState *ms) rme_guest->rom_load_notifier.notify = rme_rom_load_notify; rom_add_load_notifier(&rme_guest->rom_load_notifier); +ms->require_guest_memfd = true; cgs->ready = true; return 0; } -- 2.44.0
[PATCH v2 17/22] target/arm/cpu: Set number of PMU counters in KVM
Add a "num-pmu-counters" CPU parameter to configure the number of counters that KVM presents to the guest. This is needed for Realm VMs, whose parameters include the number of PMU counters and influence the Realm Initial Measurement. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- target/arm/cpu.h | 3 +++ target/arm/kvm_arm.h | 1 + target/arm/arm-qmp-cmds.c | 2 +- target/arm/cpu64.c| 41 +++ target/arm/kvm.c | 34 +++- 5 files changed, 79 insertions(+), 2 deletions(-) diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 24080da2b7..84f3a67dab 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -1093,6 +1093,7 @@ struct ArchCPU { /* Allows to override the default configuration */ uint8_t num_bps; uint8_t num_wps; +int8_t num_pmu_ctrs; }; typedef struct ARMCPUInfo { @@ -2312,6 +2313,8 @@ FIELD(MFAR, FPA, 12, 40) FIELD(MFAR, NSE, 62, 1) FIELD(MFAR, NS, 63, 1) +FIELD(PMCR, N, 11, 5) + QEMU_BUILD_BUG_ON(ARRAY_SIZE(((ARMCPU *)0)->ccsidr) <= R_V7M_CSSELR_INDEX_MASK); /* If adding a feature bit which corresponds to a Linux ELF diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index b040686eab..62e39e7184 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -17,6 +17,7 @@ #define KVM_ARM_VGIC_V3 (1 << 1) #define KVM_REG_ARM_ID_AA64DFR0_EL1 ARM64_SYS_REG(3, 0, 0, 5, 0) +#define KVM_REG_ARM_PMCR_EL0ARM64_SYS_REG(3, 3, 9, 12, 0) /** * kvm_arm_register_device: diff --git a/target/arm/arm-qmp-cmds.c b/target/arm/arm-qmp-cmds.c index 0f574bb1dd..985d4270b8 100644 --- a/target/arm/arm-qmp-cmds.c +++ b/target/arm/arm-qmp-cmds.c @@ -95,7 +95,7 @@ static const char *cpu_model_advertised_features[] = { "sve1408", "sve1536", "sve1664", "sve1792", "sve1920", "sve2048", "kvm-no-adjvtime", "kvm-steal-time", "pauth", "pauth-impdef", "pauth-qarma3", -"num-breakpoints", "num-watchpoints", +"num-breakpoints", "num-watchpoints", "num-pmu-counters", NULL }; diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 9ca74eb019..6c2b922d93 100644 --- 
a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -638,12 +638,53 @@ static void arm_cpu_set_num_bps(Object *obj, Visitor *v, const char *name, cpu->num_bps = val; } +static void arm_cpu_get_num_pmu_ctrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +uint8_t val; +ARMCPU *cpu = ARM_CPU(obj); + +if (cpu->num_pmu_ctrs == -1) { +val = FIELD_EX64(cpu->isar.reset_pmcr_el0, PMCR, N); +} else { +val = cpu->num_pmu_ctrs; +} + +visit_type_uint8(v, name, &val, errp); +} + +static void arm_cpu_set_num_pmu_ctrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +uint8_t val; +ARMCPU *cpu = ARM_CPU(obj); +uint8_t max_ctrs = FIELD_EX64(cpu->isar.reset_pmcr_el0, PMCR, N); + +if (!visit_type_uint8(v, name, &val, errp)) { +return; +} + +if (val > max_ctrs) { +error_setg(errp, "invalid number of PMU counters"); +return; +} + +cpu->num_pmu_ctrs = val; +} + static void aarch64_add_kvm_writable_properties(Object *obj) { +ARMCPU *cpu = ARM_CPU(obj); + object_property_add(obj, "num-breakpoints", "uint8", arm_cpu_get_num_bps, arm_cpu_set_num_bps, NULL, NULL); object_property_add(obj, "num-watchpoints", "uint8", arm_cpu_get_num_wps, arm_cpu_set_num_wps, NULL, NULL); + +cpu->num_pmu_ctrs = -1; +object_property_add(obj, "num-pmu-counters", "uint8", +arm_cpu_get_num_pmu_ctrs, arm_cpu_set_num_pmu_ctrs, +NULL, NULL); } #endif /* CONFIG_KVM */ diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 623980a25b..9855cadb1b 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -418,7 +418,7 @@ static bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) if (pmu_supported) { /* PMCR_EL0 is only accessible if the vCPU has feature PMU_V3 */ err |= read_sys_reg64(fdarray[2], &ahcf->isar.reset_pmcr_el0, - ARM64_SYS_REG(3, 3, 9, 12, 0)); + KVM_REG_ARM_PMCR_EL0); } if (sve_supported) { @@ -919,9 +919,41 @@ static void kvm_arm_configure_aa64dfr0(ARMCPU *cpu) } } +static void kvm_arm_configure_pmcr(ARMCPU *cpu) +{ +int ret; +uint64_t val, newval; 
+CPUState *cs = CPU(cpu); + +if (cpu->num_pmu_ctrs == -1) { +
[PATCH v2 21/22] hw/arm/virt: Move virt_flash_create() to machvirt_init()
For confidential VMs we'll want to skip flash device creation. Unfortunately, in virt_instance_init() the machine->cgs member has not yet been initialized, so we cannot check whether confidential guest is enabled. Move virt_flash_create() to machvirt_init(), where we can access the machine->cgs member. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- hw/arm/virt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index eca9a96b5a..bed19d0b79 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2071,6 +2071,8 @@ static void machvirt_init(MachineState *machine) unsigned int smp_cpus = machine->smp.cpus; unsigned int max_cpus = machine->smp.max_cpus; +virt_flash_create(vms); + possible_cpus = mc->possible_cpu_arch_ids(machine); /* @@ -3229,8 +3231,6 @@ static void virt_instance_init(Object *obj) vms->irqmap = a15irqmap; -virt_flash_create(vms); - vms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); vms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); } -- 2.44.0
[PATCH v2 12/22] target/arm/kvm-rme: Populate Realm memory
Collect the images copied into guest RAM into a sorted list, and issue POPULATE_REALM KVM ioctls once we've created the Realm Descriptor. The images are part of the Realm Initial Measurement. Signed-off-by: Jean-Philippe Brucker --- v1->v2: Use a ROM loader notifier --- target/arm/kvm-rme.c | 97 1 file changed, 97 insertions(+) diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index aa9c3b5551..bee6694d6d 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -9,9 +9,11 @@ #include "exec/confidential-guest-support.h" #include "hw/boards.h" #include "hw/core/cpu.h" +#include "hw/loader.h" #include "kvm_arm.h" #include "migration/blocker.h" #include "qapi/error.h" +#include "qemu/error-report.h" #include "qom/object_interfaces.h" #include "sysemu/kvm.h" #include "sysemu/runstate.h" @@ -19,10 +21,21 @@ #define TYPE_RME_GUEST "rme-guest" OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) +#define RME_PAGE_SIZE qemu_real_host_page_size() + struct RmeGuest { ConfidentialGuestSupport parent_obj; +Notifier rom_load_notifier; +GSList *ram_regions; }; +typedef struct { +hwaddr base; +hwaddr len; +/* Populate guest RAM with data, or only initialize the IPA range */ +bool populate; +} RmeRamRegion; + static RmeGuest *rme_guest; bool kvm_arm_rme_enabled(void) @@ -41,6 +54,41 @@ static int rme_create_rd(Error **errp) return ret; } +static void rme_populate_realm(gpointer data, gpointer unused) +{ +int ret; +const RmeRamRegion *region = data; + +if (region->populate) { +struct kvm_cap_arm_rme_populate_realm_args populate_args = { +.populate_ipa_base = region->base, +.populate_ipa_size = region->len, +.flags = KVM_ARM_RME_POPULATE_FLAGS_MEASURE, +}; +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_POPULATE_REALM, +(intptr_t)&populate_args); +if (ret) { +error_report("RME: failed to populate realm (0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx"): %s", + region->base, region->len, strerror(-ret)); +exit(1); +} +} else { +struct kvm_cap_arm_rme_init_ipa_args 
init_args = { +.init_ipa_base = region->base, +.init_ipa_size = region->len, +}; +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_INIT_IPA_REALM, +(intptr_t)&init_args); +if (ret) { +error_report("RME: failed to initialize GPA range (0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx"): %s", + region->base, region->len, strerror(-ret)); +exit(1); +} +} +} + static void rme_vm_state_change(void *opaque, bool running, RunState state) { int ret; @@ -55,6 +103,9 @@ static void rme_vm_state_change(void *opaque, bool running, RunState state) return; } +g_slist_foreach(rme_guest->ram_regions, rme_populate_realm, NULL); +g_slist_free_full(g_steal_pointer(&rme_guest->ram_regions), g_free); + /* * Now that do_cpu_reset() initialized the boot PC and * kvm_cpu_synchronize_post_reset() registered it, we can finalize the REC. @@ -75,6 +126,49 @@ static void rme_vm_state_change(void *opaque, bool running, RunState state) } } +static gint rme_compare_ram_regions(gconstpointer a, gconstpointer b) +{ +const RmeRamRegion *ra = a; +const RmeRamRegion *rb = b; + +g_assert(ra->base != rb->base); +return ra->base < rb->base ? -1 : 1; +} + +static void rme_add_ram_region(hwaddr base, hwaddr len, bool populate) +{ +RmeRamRegion *region; + +region = g_new0(RmeRamRegion, 1); +region->base = QEMU_ALIGN_DOWN(base, RME_PAGE_SIZE); +region->len = QEMU_ALIGN_UP(len, RME_PAGE_SIZE); +region->populate = populate; + +/* + * The Realm Initial Measurement (RIM) depends on the order in which we + * initialize and populate the RAM regions. To help a verifier + * independently calculate the RIM, sort regions by GPA. + */ +rme_guest->ram_regions = g_slist_insert_sorted(rme_guest->ram_regions, + region, + rme_compare_ram_regions); +} + +static void rme_rom_load_notify(Notifier *notifier, void *data) +{ +RomLoaderNotify *rom = data; + +if (rom->addr == -1) { +/* + * These blobs (ACPI tables) are not loaded into guest RAM at reset. + * Instead the firmware will load th
[PATCH v2 11/22] hw/core/loader: Add ROM loader notifier
Add a function to register a notifier, that is invoked after a ROM gets loaded into guest memory. It will be used by Arm confidential guest support, in order to register all blobs loaded into memory with KVM, so that their content is part of the initial VM measurement and contribute to the guest attestation. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- include/hw/loader.h | 15 +++ hw/core/loader.c| 15 +++ 2 files changed, 30 insertions(+) diff --git a/include/hw/loader.h b/include/hw/loader.h index 8685e27334..79fab25dd9 100644 --- a/include/hw/loader.h +++ b/include/hw/loader.h @@ -356,6 +356,21 @@ void hmp_info_roms(Monitor *mon, const QDict *qdict); ssize_t rom_add_vga(const char *file); ssize_t rom_add_option(const char *file, int32_t bootindex); +typedef struct RomLoaderNotify { +/* Parameters passed to rom_add_blob() */ +hwaddr addr; +size_t len; +size_t max_len; +} RomLoaderNotify; + +/** + * rom_add_load_notifier - Add a notifier for loaded images + * + * Add a notifier that will be invoked with a RomLoaderNotify structure for each + * blob loaded into guest memory, after the blob is loaded. + */ +void rom_add_load_notifier(Notifier *notifier); + /* This is the usual maximum in uboot, so if a uImage overflows this, it would * overflow on real hardware too. 
*/ #define UBOOT_MAX_GUNZIP_BYTES (64 << 20) diff --git a/hw/core/loader.c b/hw/core/loader.c index b8e52f3fb0..4bd236cf89 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -67,6 +67,8 @@ #include static int roms_loaded; +static NotifierList rom_loader_notifier = +NOTIFIER_LIST_INITIALIZER(rom_loader_notifier); /* return the size or -1 if error */ int64_t get_image_size(const char *filename) @@ -1209,6 +1211,11 @@ MemoryRegion *rom_add_blob(const char *name, const void *blob, size_t len, return mr; } +void rom_add_load_notifier(Notifier *notifier) +{ +notifier_list_add(&rom_loader_notifier, notifier); +} + /* This function is specific for elf program because we don't need to allocate * all the rom. We just allocate the first part and the rest is just zeros. This * is why romsize and datasize are different. Also, this function takes its own @@ -1250,6 +1257,7 @@ ssize_t rom_add_option(const char *file, int32_t bootindex) static void rom_reset(void *unused) { Rom *rom; +RomLoaderNotify notify; QTAILQ_FOREACH(rom, &roms, next) { if (rom->fw_file) { @@ -1298,6 +1306,13 @@ static void rom_reset(void *unused) cpu_flush_icache_range(rom->addr, rom->datasize); trace_loader_write_rom(rom->name, rom->addr, rom->datasize, rom->isrom); + +notify = (RomLoaderNotify) { +.addr = rom->addr, +.len = rom->datasize, +.max_len = rom->romsize, +}; +notifier_list_notify(&rom_loader_notifier, ¬ify); } } -- 2.44.0
[PATCH v2 16/22] target/arm/cpu: Set number of breakpoints and watchpoints in KVM
Add "num-breakpoints" and "num-watchpoints" CPU parameters to configure the debug features that KVM presents to the guest. The KVM vCPU configuration is modified by calling SET_ONE_REG on the ID register. This is needed for Realm VMs, whose parameters include breakpoints and watchpoints, and influence the Realm Initial Measurement. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- target/arm/cpu.h | 4 ++ target/arm/kvm_arm.h | 2 + target/arm/arm-qmp-cmds.c | 1 + target/arm/cpu64.c| 77 +++ target/arm/kvm.c | 56 +++- 5 files changed, 139 insertions(+), 1 deletion(-) diff --git a/target/arm/cpu.h b/target/arm/cpu.h index d3ff1b4a31..24080da2b7 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -1089,6 +1089,10 @@ struct ArchCPU { /* Generic timer counter frequency, in Hz */ uint64_t gt_cntfrq_hz; + +/* Allows to override the default configuration */ +uint8_t num_bps; +uint8_t num_wps; }; typedef struct ARMCPUInfo { diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 4b787dd628..b040686eab 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -16,6 +16,8 @@ #define KVM_ARM_VGIC_V2 (1 << 0) #define KVM_ARM_VGIC_V3 (1 << 1) +#define KVM_REG_ARM_ID_AA64DFR0_EL1 ARM64_SYS_REG(3, 0, 0, 5, 0) + /** * kvm_arm_register_device: * @mr: memory region for this device diff --git a/target/arm/arm-qmp-cmds.c b/target/arm/arm-qmp-cmds.c index 3cc8cc738b..0f574bb1dd 100644 --- a/target/arm/arm-qmp-cmds.c +++ b/target/arm/arm-qmp-cmds.c @@ -95,6 +95,7 @@ static const char *cpu_model_advertised_features[] = { "sve1408", "sve1536", "sve1664", "sve1792", "sve1920", "sve2048", "kvm-no-adjvtime", "kvm-steal-time", "pauth", "pauth-impdef", "pauth-qarma3", +"num-breakpoints", "num-watchpoints", NULL }; diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 985b1efe16..9ca74eb019 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -571,6 +571,82 @@ void aarch64_add_pauth_properties(Object *obj) } } +#if defined(CONFIG_KVM) +static void 
arm_cpu_get_num_wps(Object *obj, Visitor *v, const char *name, +void *opaque, Error **errp) +{ +uint8_t val; +ARMCPU *cpu = ARM_CPU(obj); + +val = cpu->num_wps; +if (val == 0) { +val = FIELD_EX64(cpu->isar.id_aa64dfr0, ID_AA64DFR0, WRPS) + 1; +} + +visit_type_uint8(v, name, &val, errp); +} + +static void arm_cpu_set_num_wps(Object *obj, Visitor *v, const char *name, +void *opaque, Error **errp) +{ +uint8_t val; +ARMCPU *cpu = ARM_CPU(obj); +uint8_t max_wps = FIELD_EX64(cpu->isar.id_aa64dfr0, ID_AA64DFR0, WRPS) + 1; + +if (!visit_type_uint8(v, name, &val, errp)) { +return; +} + +if (val < 2 || val > max_wps) { +error_setg(errp, "invalid number of watchpoints"); +return; +} + +cpu->num_wps = val; +} + +static void arm_cpu_get_num_bps(Object *obj, Visitor *v, const char *name, +void *opaque, Error **errp) +{ +uint8_t val; +ARMCPU *cpu = ARM_CPU(obj); + +val = cpu->num_bps; +if (val == 0) { +val = FIELD_EX64(cpu->isar.id_aa64dfr0, ID_AA64DFR0, BRPS) + 1; +} + +visit_type_uint8(v, name, &val, errp); +} + +static void arm_cpu_set_num_bps(Object *obj, Visitor *v, const char *name, +void *opaque, Error **errp) +{ +uint8_t val; +ARMCPU *cpu = ARM_CPU(obj); +uint8_t max_bps = FIELD_EX64(cpu->isar.id_aa64dfr0, ID_AA64DFR0, BRPS) + 1; + +if (!visit_type_uint8(v, name, &val, errp)) { +return; +} + +if (val < 2 || val > max_bps) { +error_setg(errp, "invalid number of breakpoints"); +return; +} + +cpu->num_bps = val; +} + +static void aarch64_add_kvm_writable_properties(Object *obj) +{ +object_property_add(obj, "num-breakpoints", "uint8", arm_cpu_get_num_bps, +arm_cpu_set_num_bps, NULL, NULL); +object_property_add(obj, "num-watchpoints", "uint8", arm_cpu_get_num_wps, +arm_cpu_set_num_wps, NULL, NULL); +} +#endif /* CONFIG_KVM */ + void arm_cpu_lpa2_finalize(ARMCPU *cpu, Error **errp) { uint64_t t; @@ -713,6 +789,7 @@ static void aarch64_host_initfn(Object *obj) if (arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) { aarch64_add_sve_properties(obj); aarch64_add_pauth_properties(obj); 
+aarch64_add_kvm_writable_properties(obj); } #elif defined(CONFIG_HVF) ARMCPU *cpu = ARM_CPU(obj); dif
[PATCH v2 09/22] target/arm/kvm-rme: Initialize vCPU
The target code calls kvm_arm_vcpu_init() to mark the vCPU as part of a Realm. For a Realm vCPU, only x0-x7 can be set at runtime. Before boot, the PC can also be set, and is ignored at runtime. KVM also accepts a few system register changes during initial configuration, as returned by KVM_GET_REG_LIST. Signed-off-by: Jean-Philippe Brucker --- v1->v2: only do the GP regs, since they are sync'd explicitly. Other registers use the existing reglist facility. --- target/arm/cpu.h | 3 +++ target/arm/kvm_arm.h | 1 + target/arm/kvm-rme.c | 10 target/arm/kvm.c | 61 4 files changed, 75 insertions(+) diff --git a/target/arm/cpu.h b/target/arm/cpu.h index bc0c84873f..d3ff1b4a31 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -945,6 +945,9 @@ struct ArchCPU { OnOffAuto kvm_steal_time; #endif /* CONFIG_KVM */ +/* Realm Management Extension */ +bool kvm_rme; + /* Uniprocessor system with MP extensions */ bool mp_is_up; diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 8e2d90c265..4386b0 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -220,6 +220,7 @@ int kvm_arm_rme_init(MachineState *ms); int kvm_arm_rme_vm_type(MachineState *ms); bool kvm_arm_rme_enabled(void); +int kvm_arm_rme_vcpu_init(CPUState *cs); #else diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 23ac2d32d4..aa9c3b5551 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -106,6 +106,16 @@ int kvm_arm_rme_init(MachineState *ms) return 0; } +int kvm_arm_rme_vcpu_init(CPUState *cs) +{ +ARMCPU *cpu = ARM_CPU(cs); + +if (rme_guest) { +cpu->kvm_rme = true; +} +return 0; +} + int kvm_arm_rme_vm_type(MachineState *ms) { if (rme_guest) { diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 3504276822..3a2233ec73 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -1920,6 +1920,11 @@ int kvm_arch_init_vcpu(CPUState *cs) return ret; } +ret = kvm_arm_rme_vcpu_init(cs); +if (ret) { +return ret; +} + if (cpu_isar_feature(aa64_sve, cpu)) { ret = 
kvm_arm_sve_set_vls(cpu); if (ret) { @@ -2056,6 +2061,35 @@ static int kvm_arch_put_sve(CPUState *cs) return 0; } +static int kvm_arm_rme_put_core_regs(CPUState *cs) +{ +int i, ret; +struct kvm_one_reg reg; +ARMCPU *cpu = ARM_CPU(cs); +CPUARMState *env = &cpu->env; + +/* + * The RME ABI only allows us to set 8 GPRs and the PC + */ +for (i = 0; i < 8; i++) { +reg.id = AARCH64_CORE_REG(regs.regs[i]); +reg.addr = (uintptr_t) &env->xregs[i]; +ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); +if (ret) { +return ret; +} +} + +reg.id = AARCH64_CORE_REG(regs.pc); +reg.addr = (uintptr_t) &env->pc; +ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); +if (ret) { +return ret; +} + +return 0; +} + static int kvm_arm_put_core_regs(CPUState *cs, int level) { uint64_t val; @@ -2066,6 +2100,10 @@ static int kvm_arm_put_core_regs(CPUState *cs, int level) ARMCPU *cpu = ARM_CPU(cs); CPUARMState *env = &cpu->env; +if (cpu->kvm_rme) { +return kvm_arm_rme_put_core_regs(cs); +} + /* If we are in AArch32 mode then we need to copy the AArch32 regs to the * AArch64 registers before pushing them out to 64-bit KVM. */ @@ -2253,6 +2291,25 @@ static int kvm_arch_get_sve(CPUState *cs) return 0; } +static int kvm_arm_rme_get_core_regs(CPUState *cs) +{ +int i, ret; +struct kvm_one_reg reg; +ARMCPU *cpu = ARM_CPU(cs); +CPUARMState *env = &cpu->env; + +for (i = 0; i < 8; i++) { +reg.id = AARCH64_CORE_REG(regs.regs[i]); +reg.addr = (uintptr_t) &env->xregs[i]; +ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); +if (ret) { +return ret; +} +} + +return 0; +} + static int kvm_arm_get_core_regs(CPUState *cs) { uint64_t val; @@ -2263,6 +2320,10 @@ static int kvm_arm_get_core_regs(CPUState *cs) ARMCPU *cpu = ARM_CPU(cs); CPUARMState *env = &cpu->env; +if (cpu->kvm_rme) { +return kvm_arm_rme_get_core_regs(cs); +} + for (i = 0; i < 31; i++) { ret = kvm_get_one_reg(cs, AARCH64_CORE_REG(regs.regs[i]), &env->xregs[i]); -- 2.44.0
[PATCH v2 04/22] target/arm/kvm-rme: Initialize realm
The machine code calls kvm_arm_rme_vm_type() to get the VM flag and KVM calls kvm_arm_rme_init() to issue KVM hypercalls: * create the realm descriptor, * load images into Realm RAM (in another patch), * finalize the REC (vCPU) after the registers are reset, * activate the realm at the end, at which point the realm is sealed. Signed-off-by: Jean-Philippe Brucker --- v1->v2: * Use g_assert_not_reached() in stubs * Init from kvm_arch_init() rather than hw/arm/virt * Cache rme_guest --- target/arm/kvm_arm.h | 16 +++ target/arm/kvm-rme.c | 101 +++ target/arm/kvm.c | 7 ++- 3 files changed, 123 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index cfaa0d9bc7..8e2d90c265 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -203,6 +203,8 @@ int kvm_arm_vgic_probe(void); void kvm_arm_pmu_init(ARMCPU *cpu); void kvm_arm_pmu_set_irq(ARMCPU *cpu, int irq); +int kvm_arm_vcpu_finalize(ARMCPU *cpu, int feature); + /** * kvm_arm_pvtime_init: * @cpu: ARMCPU @@ -214,6 +216,11 @@ void kvm_arm_pvtime_init(ARMCPU *cpu, uint64_t ipa); int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); +int kvm_arm_rme_init(MachineState *ms); +int kvm_arm_rme_vm_type(MachineState *ms); + +bool kvm_arm_rme_enabled(void); + #else /* @@ -283,6 +290,15 @@ static inline uint32_t kvm_arm_sve_get_vls(ARMCPU *cpu) g_assert_not_reached(); } +static inline int kvm_arm_rme_init(MachineState *ms) +{ +g_assert_not_reached(); +} + +static inline int kvm_arm_rme_vm_type(MachineState *ms) +{ +g_assert_not_reached(); +} #endif #endif diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 960dd75608..23ac2d32d4 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -23,14 +23,115 @@ struct RmeGuest { ConfidentialGuestSupport parent_obj; }; +static RmeGuest *rme_guest; + +bool kvm_arm_rme_enabled(void) +{ +return !!rme_guest; +} + +static int rme_create_rd(Error **errp) +{ +int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, 
+KVM_CAP_ARM_RME_CREATE_RD); + +if (ret) { +error_setg_errno(errp, -ret, "RME: failed to create Realm Descriptor"); +} +return ret; +} + +static void rme_vm_state_change(void *opaque, bool running, RunState state) +{ +int ret; +CPUState *cs; + +if (!running) { +return; +} + +ret = rme_create_rd(&error_abort); +if (ret) { +return; +} + +/* + * Now that do_cpu_reset() initialized the boot PC and + * kvm_cpu_synchronize_post_reset() registered it, we can finalize the REC. + */ +CPU_FOREACH(cs) { +ret = kvm_arm_vcpu_finalize(ARM_CPU(cs), KVM_ARM_VCPU_REC); +if (ret) { +error_report("RME: failed to finalize vCPU: %s", strerror(-ret)); +exit(1); +} +} + +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_ACTIVATE_REALM); +if (ret) { +error_report("RME: failed to activate realm: %s", strerror(-ret)); +exit(1); +} +} + +int kvm_arm_rme_init(MachineState *ms) +{ +static Error *rme_mig_blocker; +ConfidentialGuestSupport *cgs = ms->cgs; + +if (!rme_guest) { +return 0; +} + +if (!cgs) { +error_report("missing -machine confidential-guest-support parameter"); +return -EINVAL; +} + +if (!kvm_check_extension(kvm_state, KVM_CAP_ARM_RME)) { +return -ENODEV; +} + +error_setg(&rme_mig_blocker, "RME: migration is not implemented"); +migrate_add_blocker(&rme_mig_blocker, &error_fatal); + +/* + * The realm activation is done last, when the VM starts, after all images + * have been loaded and all vcpus finalized. 
+ */ +qemu_add_vm_change_state_handler(rme_vm_state_change, NULL); + +cgs->ready = true; +return 0; +} + +int kvm_arm_rme_vm_type(MachineState *ms) +{ +if (rme_guest) { +return KVM_VM_TYPE_ARM_REALM; +} +return 0; +} + static void rme_guest_class_init(ObjectClass *oc, void *data) { } +static void rme_guest_instance_init(Object *obj) +{ +if (rme_guest) { +error_report("a single instance of RmeGuest is supported"); +exit(1); +} +rme_guest = RME_GUEST(obj); +} + static const TypeInfo rme_guest_info = { .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, .name = TYPE_RME_GUEST, .instance_size = sizeof(struct RmeGuest), +.instance_init = rme_guest_instance_init, .class_init = rme_guest_class_init, .interfaces = (InterfaceInfo[]) { { TYPE_USER_CREATABLE }, diff --git a/target/arm/kvm.c b/target/arm/kvm.c index a5673241e5..b00077c1a5 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -93,7 +93,7 @@ static int kvm_arm_vcpu_init(ARMCPU *cpu) * * Returns: 0 i
[PATCH v2 15/22] target/arm/kvm-rme: Add measurement algorithm property
This option selects which measurement algorithm to use for attestation. Supported values are SHA256 and SHA512. Default to SHA512 arbitrarily. SHA512 is generally faster on 64-bit architectures. On a few arm64 CPUs I tested SHA256 is much faster, but that's most likely because they only support acceleration via FEAT_SHA256 (Armv8.0) and not FEAT_SHA512 (Armv8.2). Future CPUs supporting RME are likely to also support FEAT_SHA512. Cc: Eric Blake Cc: Markus Armbruster Cc: Daniel P. Berrangé Cc: Eduardo Habkost Signed-off-by: Jean-Philippe Brucker --- v1->v2: use enum, pick default --- qapi/qom.json| 18 +- target/arm/kvm-rme.c | 39 ++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/qapi/qom.json b/qapi/qom.json index 91654aa267..84dce666b2 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -931,18 +931,34 @@ 'data': { '*cpu-affinity': ['uint16'], '*node-affinity': ['uint16'] } } +## +# @RmeGuestMeasurementAlgo: +# +# @sha256: Use the SHA256 algorithm +# @sha512: Use the SHA512 algorithm +# +# Algorithm to use for realm measurements +# +# Since: FIXME +## +{ 'enum': 'RmeGuestMeasurementAlgo', + 'data': ['sha256', 'sha512'] } + ## # @RmeGuestProperties: # # Properties for rme-guest objects. 
# +# @measurement-algo: Realm measurement algorithm (default: sha512) +# # @personalization-value: Realm personalization value, as a 64-byte hex string # (default: 0) # # Since: FIXME ## { 'struct': 'RmeGuestProperties', - 'data': { '*personalization-value': 'str' } } + 'data': { '*personalization-value': 'str', +'*measurement-algo': 'RmeGuestMeasurementAlgo' } } ## # @ObjectType: diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index cb5c3f7a22..8f39e54aaa 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -23,13 +23,14 @@ OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) #define RME_PAGE_SIZE qemu_real_host_page_size() -#define RME_MAX_CFG 1 +#define RME_MAX_CFG 2 struct RmeGuest { ConfidentialGuestSupport parent_obj; Notifier rom_load_notifier; GSList *ram_regions; uint8_t *personalization_value; +RmeGuestMeasurementAlgo measurement_algo; }; typedef struct { @@ -73,6 +74,19 @@ static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) memcpy(args.rpv, guest->personalization_value, KVM_CAP_ARM_RME_RPV_SIZE); cfg_str = "personalization value"; break; +case KVM_CAP_ARM_RME_CFG_HASH_ALGO: +switch (guest->measurement_algo) { +case RME_GUEST_MEASUREMENT_ALGO_SHA256: +args.hash_algo = KVM_CAP_ARM_RME_MEASUREMENT_ALGO_SHA256; +break; +case RME_GUEST_MEASUREMENT_ALGO_SHA512: +args.hash_algo = KVM_CAP_ARM_RME_MEASUREMENT_ALGO_SHA512; +break; +default: +g_assert_not_reached(); +} +cfg_str = "hash algorithm"; +break; default: g_assert_not_reached(); } @@ -338,12 +352,34 @@ static void rme_set_rpv(Object *obj, const char *value, Error **errp) } } +static int rme_get_measurement_algo(Object *obj, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); + +return guest->measurement_algo; +} + +static void rme_set_measurement_algo(Object *obj, int algo, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); + +guest->measurement_algo = algo; +} + static void rme_guest_class_init(ObjectClass *oc, void *data) { object_class_property_add_str(oc, 
"personalization-value", rme_get_rpv, rme_set_rpv); object_class_property_set_description(oc, "personalization-value", "Realm personalization value (512-bit hexadecimal number)"); + +object_class_property_add_enum(oc, "measurement-algo", + "RmeGuestMeasurementAlgo", + &RmeGuestMeasurementAlgo_lookup, + rme_get_measurement_algo, + rme_set_measurement_algo); +object_class_property_set_description(oc, "measurement-algo", +"Realm measurement algorithm ('sha256', 'sha512')"); } static void rme_guest_instance_init(Object *obj) @@ -353,6 +389,7 @@ static void rme_guest_instance_init(Object *obj) exit(1); } rme_guest = RME_GUEST(obj); +rme_guest->measurement_algo = RME_GUEST_MEASUREMENT_ALGO_SHA512; } static const TypeInfo rme_guest_info = { -- 2.44.0
[PATCH v2 13/22] hw/arm/boot: Register Linux BSS section for confidential guests
Although the BSS section is not currently part of the kernel blob, it needs to be registered as guest RAM for confidential guest support, because the kernel needs to access it before it is able to setup its RAM regions. It would be tempting to simply add the BSS as part of the ROM blob (ie pass kernel_size as max_len argument to rom_add_blob()) and let the ROM loader notifier deal with the full image size generically, but that would add zero-initialization of the BSS region by the loader, which adds a significant overhead. For a 40MB kernel with a 17MB BSS, I measured an average boot time regression of 2.8ms on a fast desktop, 5.7% of the QEMU setup time). On a slower host, the regression could be much larger. Instead, add a special case to initialize the kernel's BSS IPA range. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- target/arm/kvm_arm.h | 5 + hw/arm/boot.c| 11 +++ target/arm/kvm-rme.c | 10 ++ 3 files changed, 26 insertions(+) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 4386b0..4b787dd628 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -218,6 +218,7 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); int kvm_arm_rme_init(MachineState *ms); int kvm_arm_rme_vm_type(MachineState *ms); +void kvm_arm_rme_init_guest_ram(hwaddr base, size_t size); bool kvm_arm_rme_enabled(void); int kvm_arm_rme_vcpu_init(CPUState *cs); @@ -243,6 +244,10 @@ static inline bool kvm_arm_sve_supported(void) return false; } +static inline void kvm_arm_rme_init_guest_ram(hwaddr base, size_t size) +{ +} + /* * These functions should never actually be called without KVM support. 
*/ diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 84ea6a807a..9f522e332b 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -26,6 +26,7 @@ #include "qemu/config-file.h" #include "qemu/option.h" #include "qemu/units.h" +#include "kvm_arm.h" /* Kernel boot protocol is specified in the kernel docs * Documentation/arm/Booting and Documentation/arm64/booting.txt @@ -850,6 +851,7 @@ static uint64_t load_aarch64_image(const char *filename, hwaddr mem_base, { hwaddr kernel_load_offset = KERNEL64_LOAD_ADDR; uint64_t kernel_size = 0; +uint64_t page_size; uint8_t *buffer; int size; @@ -916,6 +918,15 @@ static uint64_t load_aarch64_image(const char *filename, hwaddr mem_base, *entry = mem_base + kernel_load_offset; rom_add_blob_fixed_as(filename, buffer, size, *entry, as); +/* + * Register the kernel BSS as realm resource, so the kernel can use it right + * away. Align up to skip the last page, which still contains kernel + * data. + */ +page_size = qemu_real_host_page_size(); +kvm_arm_rme_init_guest_ram(QEMU_ALIGN_UP(*entry + size, page_size), + QEMU_ALIGN_DOWN(kernel_size - size, page_size)); + g_free(buffer); return kernel_size; diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index bee6694d6d..b2ad10ef6d 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -203,6 +203,16 @@ int kvm_arm_rme_init(MachineState *ms) return 0; } +/* + * kvm_arm_rme_init_guest_ram - Initialize a Realm IPA range + */ +void kvm_arm_rme_init_guest_ram(hwaddr base, size_t size) +{ +if (rme_guest) { +rme_add_ram_region(base, size, /* populate */ false); +} +} + int kvm_arm_rme_vcpu_init(CPUState *cs) { ARMCPU *cpu = ARM_CPU(cs); -- 2.44.0
[PATCH v2 19/22] target/arm/cpu: Inform about reading confidential CPU registers
The host cannot access registers of a Realm. Instead of showing all registers as zero in "info registers", display a message about this restriction. Signed-off-by: Jean-Philippe Brucker --- v1->v2: new --- target/arm/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/target/arm/cpu.c b/target/arm/cpu.c index ab8d007a86..18d1b88e2f 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -1070,6 +1070,11 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE *f, int flags) const char *ns_status; bool sve; +if (cpu->kvm_rme) { +qemu_fprintf(f, "the CPU registers are confidential to the realm\n"); +return; +} + qemu_fprintf(f, " PC=%016" PRIx64 " ", env->pc); for (i = 0; i < 32; i++) { if (i == 31) { -- 2.44.0
[PATCH v2 18/22] target/arm/kvm: Disable Realm reboot
A realm cannot be reset, it must be recreated from scratch. The RMM specification defines states of a Realm as NEW -> ACTIVE -> SYSTEM_OFF, after which the Realm can only be destroyed. A PSCI_SYSTEM_RESET call, which normally reboots the system, puts the Realm in SYSTEM_OFF state. QEMU does not support recreating a VM. Normally, a reboot request by the guest causes all devices to reset, which cannot work for a Realm. Indeed, loading images into Realm memory and changing the PC is only allowed for a Realm in NEW state. Resetting the images for a Realm in SYSTEM_OFF state will cause QEMU to crash with a bus error. Handle reboot requests by the guest more gracefully, by indicating to runstate.c that the vCPUs of a Realm are not resettable, and that QEMU should exit. Reviewed-by: Richard Henderson Signed-off-by: Jean-Philippe Brucker --- target/arm/kvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 9855cadb1b..60c2ef9388 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -1694,7 +1694,8 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) bool kvm_arch_cpu_check_are_resettable(void) { -return true; +/* A Realm cannot be reset */ +return !kvm_arm_rme_enabled(); } static void kvm_arch_get_eager_split_size(Object *obj, Visitor *v, -- 2.44.0
[PATCH v2 06/22] hw/arm/virt: Disable DTB randomness for confidential VMs
The dtb-randomness feature, which adds random seeds to the DTB, isn't really compatible with confidential VMs since it randomizes the Realm Initial Measurement. Enabling it is not an error, but it prevents attestation. It also isn't useful to a Realm, which doesn't trust host input. Currently the feature is automatically enabled, unless the user disables it on the command-line. Change it to OnOffAuto, and automatically disable it for confidential VMs, unless the user explicitly enables it. Signed-off-by: Jean-Philippe Brucker --- v1->v2: separate patch, use OnOffAuto --- docs/system/arm/virt.rst | 9 + include/hw/arm/virt.h| 2 +- hw/arm/virt.c| 41 +--- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/docs/system/arm/virt.rst b/docs/system/arm/virt.rst index 26fcba00b7..e4bbfec662 100644 --- a/docs/system/arm/virt.rst +++ b/docs/system/arm/virt.rst @@ -172,10 +172,11 @@ dtb-randomness rng-seed and kaslr-seed nodes (in both "/chosen" and "/secure-chosen") to use for features like the random number generator and address space randomisation. The default is - ``on``. You will want to disable it if your trusted boot chain - will verify the DTB it is passed, since this option causes the - DTB to be non-deterministic. It would be the responsibility of - the firmware to come up with a seed and pass it on if it wants to. + ``off`` for confidential VMs, and ``on`` otherwise. You will want + to disable it if your trusted boot chain will verify the DTB it is + passed, since this option causes the DTB to be non-deterministic. + It would be the responsibility of the firmware to come up with a + seed and pass it on if it wants to. dtb-kaslr-seed A deprecated synonym for dtb-randomness. 
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index bb486d36b1..90a148dac2 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -150,7 +150,7 @@ struct VirtMachineState { bool virt; bool ras; bool mte; -bool dtb_randomness; +OnOffAuto dtb_randomness; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 07ad31876e..f300f100b5 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -259,6 +259,7 @@ static bool ns_el2_virt_timer_present(void) static void create_fdt(VirtMachineState *vms) { +bool dtb_randomness = true; MachineState *ms = MACHINE(vms); int nb_numa_nodes = ms->numa_state->num_nodes; void *fdt = create_device_tree(&vms->fdt_size); @@ -268,6 +269,16 @@ static void create_fdt(VirtMachineState *vms) exit(1); } +/* + * Including random data in the DTB causes random intial measurement on CCA, + * so disable it for confidential VMs. + */ +if (vms->dtb_randomness == ON_OFF_AUTO_OFF || +(vms->dtb_randomness == ON_OFF_AUTO_AUTO && + virt_machine_is_confidential(vms))) { +dtb_randomness = false; +} + ms->fdt = fdt; /* Header */ @@ -278,13 +289,13 @@ static void create_fdt(VirtMachineState *vms) /* /chosen must exist for load_dtb to fill in necessary properties later */ qemu_fdt_add_subnode(fdt, "/chosen"); -if (vms->dtb_randomness) { +if (dtb_randomness) { create_randomness(ms, "/chosen"); } if (vms->secure) { qemu_fdt_add_subnode(fdt, "/secure-chosen"); -if (vms->dtb_randomness) { +if (dtb_randomness) { create_randomness(ms, "/secure-chosen"); } } @@ -2474,18 +2485,21 @@ static void virt_set_its(Object *obj, bool value, Error **errp) vms->its = value; } -static bool virt_get_dtb_randomness(Object *obj, Error **errp) +static void virt_get_dtb_randomness(Object *obj, Visitor *v, const char *name, +void *opaque, Error **errp) { VirtMachineState *vms = VIRT_MACHINE(obj); +OnOffAuto dtb_randomness = vms->dtb_randomness; -return vms->dtb_randomness; +visit_type_OnOffAuto(v, name, 
&dtb_randomness, errp); } -static void virt_set_dtb_randomness(Object *obj, bool value, Error **errp) +static void virt_set_dtb_randomness(Object *obj, Visitor *v, const char *name, +void *opaque, Error **errp) { VirtMachineState *vms = VIRT_MACHINE(obj); -vms->dtb_randomness = value; +visit_type_OnOffAuto(v, name, &vms->dtb_randomness, errp); } static char *virt_get_oem_id(Object *obj, Error **errp) @@ -3123,16 +3137,16 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) "Set on/off to enable/disable " "ITS instantiation"); -object_class_property_add_bool(oc, "dtb-rand
Re: [PATCH v2 02/22] target/arm: Add confidential guest support
On Fri, Apr 19, 2024 at 05:25:12PM +0100, Daniel P. Berrangé wrote: > On Fri, Apr 19, 2024 at 04:56:50PM +0100, Jean-Philippe Brucker wrote: > > Add a new RmeGuest object, inheriting from ConfidentialGuestSupport, to > > support the Arm Realm Management Extension (RME). It is instantiated by > > passing on the command-line: > > > > -M virt,confidential-guest-support= > > -object guest-rme,id=[,options...] Hm, the commit message is wrong, it should say "rme-guest". > How about either "arm-rme" or merely 'rme' as the object name I don't feel strongly about the name, but picked this one to be consistent with existing confidential-guest-support objects: sev-guest, pef-guest, s390-pv-guest, and upcoming tdx-guest [1] Thanks, Jean [1] https://lore.kernel.org/qemu-devel/20240229063726.610065-13-xiaoyao...@intel.com/
Re: [PATCH for-5.0 v11 02/20] virtio-iommu: Add skeleton
Hi Eric, On Fri, Nov 22, 2019 at 07:29:25PM +0100, Eric Auger wrote: > +typedef struct VirtIOIOMMU { > +VirtIODevice parent_obj; > +VirtQueue *req_vq; > +VirtQueue *event_vq; > +struct virtio_iommu_config config; > +uint64_t features; > +uint64_t acked_features; We already have guest_features in the parent object. > +GHashTable *as_by_busptr; > +IOMMUPciBus *as_by_bus_num[IOMMU_PCI_BUS_MAX]; Doesn't seem used anymore. Thanks, Jean > +PCIBus *primary_bus; > +GTree *domains; > +QemuMutex mutex; > +GTree *endpoints; > +} VirtIOIOMMU; > + > +#endif > -- > 2.20.1 > >
Re: [PATCH for-5.0 v11 03/20] virtio-iommu: Decode the command payload
On Fri, Nov 22, 2019 at 07:29:26PM +0100, Eric Auger wrote: > This patch adds the command payload decoding and > introduces the functions that will do the actual > command handling. Those functions are not yet implemented. > > Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker Which isn't worth much since I don't have prior QEMU experience but I did stare at this code for a while and work on future extensions. Thanks, Jean
Re: [PATCH for-5.0 v11 04/20] virtio-iommu: Add the iommu regions
Two small things below, but looks good overall Reviewed-by: Jean-Philippe Brucker On Fri, Nov 22, 2019 at 07:29:27PM +0100, Eric Auger wrote: > +static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, > + int devfn) > +{ > +VirtIOIOMMU *s = opaque; > +IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); > +static uint32_t mr_index; > +IOMMUDevice *sdev; > + > +if (!sbus) { > +sbus = g_malloc0(sizeof(IOMMUPciBus) + > + sizeof(IOMMUDevice *) * IOMMU_PCI_DEVFN_MAX); > +sbus->bus = bus; > +g_hash_table_insert(s->as_by_busptr, bus, sbus); > +} > + > +sdev = sbus->pbdev[devfn]; > +if (!sdev) { > +char *name = g_strdup_printf("%s-%d-%d", > + TYPE_VIRTIO_IOMMU_MEMORY_REGION, > + mr_index++, devfn); > +sdev = sbus->pbdev[devfn] = g_malloc0(sizeof(IOMMUDevice)); > + > +sdev->viommu = s; > +sdev->bus = bus; > +sdev->devfn = devfn; It might be better to store the endpoint ID in IOMMUDevice, then you could get rid of virtio_iommu_get_sid(), and remove a tiny bit of overhead in virtio_iommu_translate(). But I doubt it's significant. [...] > +static const TypeInfo virtio_iommu_memory_region_info = { > +.parent = TYPE_IOMMU_MEMORY_REGION, > +.name = TYPE_VIRTIO_IOMMU_MEMORY_REGION, > +.class_init = virtio_iommu_memory_region_class_init, > +}; > + > + nit: newline. Thanks, Jean
Re: [PATCH for-5.0 v11 05/20] virtio-iommu: Endpoint and domains structs and helpers
On Fri, Nov 22, 2019 at 07:29:28PM +0100, Eric Auger wrote: > +typedef struct viommu_domain { > +uint32_t id; > +GTree *mappings; > +QLIST_HEAD(, viommu_endpoint) endpoint_list; > +} viommu_domain; > + > +typedef struct viommu_endpoint { > +uint32_t id; > +viommu_domain *domain; > +QLIST_ENTRY(viommu_endpoint) next; > +} viommu_endpoint; There might be a way to merge viommu_endpoint and the IOMMUDevice structure introduced in patch 4, since they both represent one endpoint. Maybe virtio_iommu_find_add_pci_as() could add the IOMMUDevice to s->endpoints, and IOMMUDevice could store the endpoint ID rather than bus and devfn. > +typedef struct viommu_interval { > +uint64_t low; > +uint64_t high; > +} viommu_interval; I guess these should be named in CamelCase? Although if we're allowed to choose my vote goes to underscores :) Thanks, Jean
Re: [PATCH for-5.0 v11 06/20] virtio-iommu: Implement attach/detach command
On Fri, Nov 22, 2019 at 07:29:29PM +0100, Eric Auger wrote: > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 235bde2203..138d5b2a9c 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -77,11 +77,12 @@ static gint interval_cmp(gconstpointer a, gconstpointer > b, gpointer user_data) > static void virtio_iommu_detach_endpoint_from_domain(viommu_endpoint *ep) > { > QLIST_REMOVE(ep, next); > +g_tree_unref(ep->domain->mappings); > ep->domain = NULL; > } > > -viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, uint32_t ep_id); > -viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, uint32_t ep_id) > +static viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, > + uint32_t ep_id) > { > viommu_endpoint *ep; > > @@ -102,15 +103,14 @@ static void virtio_iommu_put_endpoint(gpointer data) > > if (ep->domain) { > virtio_iommu_detach_endpoint_from_domain(ep); > -g_tree_unref(ep->domain->mappings); > } > > trace_virtio_iommu_put_endpoint(ep->id); > g_free(ep); > } > > -viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s, uint32_t domain_id); > -viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s, uint32_t domain_id) > +static viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s, > + uint32_t domain_id) Looks like the above change belong to patch 5? > { > viommu_domain *domain; > > @@ -137,7 +137,6 @@ static void virtio_iommu_put_domain(gpointer data) > QLIST_FOREACH_SAFE(iter, &domain->endpoint_list, next, tmp) { > virtio_iommu_detach_endpoint_from_domain(iter); > } > -g_tree_destroy(domain->mappings); When created by virtio_iommu_get_domain(), mappings has one reference. Then for each attach (including the first one) an additional reference is taken, and freed by virtio_iommu_detach_endpoint_from_domain(). So I think there are two problems: * virtio_iommu_put_domain() drops one ref for each endpoint, but we still have one reference to mappings, so they're not freed. 
We do need this g_tree_destroy() * After detaching all the endpoints, the guest may reuse the domain ID for another domain, but the previous mappings haven't been erased. Not sure how to fix this using the g_tree refs, because dropping all the references will free the internal tree data and it won't be reusable. Thanks, Jean
Re: [PATCH for-5.0 v11 07/20] virtio-iommu: Implement map/unmap
On Fri, Nov 22, 2019 at 07:29:30PM +0100, Eric Auger wrote: > @@ -238,10 +244,35 @@ static int virtio_iommu_map(VirtIOIOMMU *s, > uint64_t virt_start = le64_to_cpu(req->virt_start); > uint64_t virt_end = le64_to_cpu(req->virt_end); > uint32_t flags = le32_to_cpu(req->flags); > +viommu_domain *domain; > +viommu_interval *interval; > +viommu_mapping *mapping; Additional checks would be good. Most importantly we need to return S_INVAL if we don't recognize a bit in flags (a MUST in the spec). It might be good to check that addresses are aligned on the page granule as well, and return S_RANGE if they aren't (a SHOULD in the spec), but I don't care as much. > + > +interval = g_malloc0(sizeof(*interval)); > + > +interval->low = virt_start; > +interval->high = virt_end; > + > +domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); > +if (!domain) { > +return VIRTIO_IOMMU_S_NOENT; Leaks interval, I guess you could allocate it after this block. Thanks, Jean > +} > + > +mapping = g_tree_lookup(domain->mappings, (gpointer)interval); > +if (mapping) { > +g_free(interval); > +return VIRTIO_IOMMU_S_INVAL; > +} > > trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, > flags); > > -return VIRTIO_IOMMU_S_UNSUPP; > +mapping = g_malloc0(sizeof(*mapping)); > +mapping->phys_addr = phys_start; > +mapping->flags = flags; > + > +g_tree_insert(domain->mappings, interval, mapping); > + > +return VIRTIO_IOMMU_S_OK;
Re: [PATCH for-5.0 v11 08/20] virtio-iommu: Implement translate
On Fri, Nov 22, 2019 at 07:29:31PM +0100, Eric Auger wrote: > This patch implements the translate callback > > Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker
Re: [PATCH for-5.0 v11 09/20] virtio-iommu: Implement fault reporting
On Fri, Nov 22, 2019 at 07:29:32PM +0100, Eric Auger wrote: > @@ -443,6 +489,8 @@ static IOMMUTLBEntry > virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, > if (!ep) { > if (!bypass_allowed) { > error_report_once("%s sid=%d is not known!!", __func__, sid); > +virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN, > + 0, sid, 0); I guess we could report the faulting address as well, it can be useful for diagnostics. > } else { > entry.perm = flag; > } > @@ -455,6 +503,8 @@ static IOMMUTLBEntry > virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, >"%s %02x:%02x.%01x not attached to any domain\n", >__func__, PCI_BUS_NUM(sid), >PCI_SLOT(sid), PCI_FUNC(sid)); > +virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN, > + 0, sid, 0); Here as well, especially since that error would get propagated by a linux guest to the device driver > } else { > entry.perm = flag; > } > @@ -468,16 +518,25 @@ static IOMMUTLBEntry > virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, > qemu_log_mask(LOG_GUEST_ERROR, >"%s no mapping for 0x%"PRIx64" for sid=%d\n", >__func__, addr, sid); > +virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING, > + 0, sid, addr); Flag VIRTIO_IOMMU_FAULT_F_ADDRESS denotes a valid address field Thanks, Jean
Re: [PATCH for-5.0 v11 10/20] virtio-iommu-pci: Add virtio iommu pci support
On Fri, Nov 22, 2019 at 07:29:33PM +0100, Eric Auger wrote: > This patch adds virtio-iommu-pci, which is the pci proxy for > the virtio-iommu device. > > Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker
Re: [PATCH for-5.0 v11 14/20] virtio-iommu: Handle reserved regions in the translation process
On Fri, Nov 22, 2019 at 07:29:37PM +0100, Eric Auger wrote: > +for (i = 0; i < s->nb_reserved_regions; i++) { > +if (interval.low >= s->reserved_regions[i].low && > +interval.low <= s->reserved_regions[i].high) { > +switch (s->reserved_regions[i].type) { > +case VIRTIO_IOMMU_RESV_MEM_T_MSI: > +entry.perm = flag; > +goto unlock; > +case VIRTIO_IOMMU_RESV_MEM_T_RESERVED: > +default: > +virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING, > + 0, sid, addr); Needs the VIRTIO_IOMMU_FAULT_F_ADDRESS flag. Thanks, Jean
Re: [PATCH for-5.0 v11 15/20] virtio-iommu-pci: Add array of Interval properties
On Fri, Nov 22, 2019 at 07:29:38PM +0100, Eric Auger wrote: > The machine may need to pass reserved regions to the > virtio-iommu-pci device (such as the MSI window on x86). > So let's add an array of Interval properties. > > Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker
Re: [PATCH for-5.0 v11 18/20] virtio-iommu: Support migration
On Fri, Nov 22, 2019 at 07:29:41PM +0100, Eric Auger wrote: > +static const VMStateDescription vmstate_virtio_iommu_device = { > +.name = "virtio-iommu-device", > +.minimum_version_id = 1, > +.version_id = 1, > +.post_load = iommu_post_load, > +.fields = (VMStateField[]) { > +VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 1, > + &vmstate_domain, viommu_domain), > +VMSTATE_GTREE_DIRECT_KEY_V(endpoints, VirtIOIOMMU, 1, > + &vmstate_endpoint, viommu_endpoint), So if I understand correctly these fields are state that is modified by the guest? We don't need to save/load fields that cannot be modified by the guest, static information that is created from the QEMU command-line. I think the above covers everything we need to migrate in VirtIOIOMMU then, except for acked_features, which (as I pointed out on another patch) seems redundant anyway since there is vdev->guest_features. Reviewed-by: Jean-Philippe Brucker > +VMSTATE_END_OF_LIST() > +}, > +}; > + > + > static const VMStateDescription vmstate_virtio_iommu = { > .name = "virtio-iommu", > .minimum_version_id = 1, > -- > 2.20.1 > >
Re: [PATCH for-5.0 v11 11/20] hw/arm/virt: Add the virtio-iommu device tree mappings
On Fri, Nov 22, 2019 at 07:29:34PM +0100, Eric Auger wrote: > diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c > index 280230b31e..4cfae1f9df 100644 > --- a/hw/virtio/virtio-iommu-pci.c > +++ b/hw/virtio/virtio-iommu-pci.c > @@ -31,9 +31,6 @@ struct VirtIOIOMMUPCI { > > static Property virtio_iommu_pci_properties[] = { > DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), > -DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI, > - vdev.nb_reserved_regions, vdev.reserved_regions, > - qdev_prop_interval, Interval), Belongs in patch 10? Apart from that Reviewed-by: Jean-Philippe Brucker
Re: [PATCH for-5.0 v11 16/20] hw/arm/virt-acpi-build: Introduce fill_iort_idmap helper
On Fri, Nov 22, 2019 at 07:29:39PM +0100, Eric Auger wrote: > build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) > { > @@ -426,13 +437,12 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > smmu->gerr_gsiv = cpu_to_le32(irq + 2); > smmu->sync_gsiv = cpu_to_le32(irq + 3); > > -/* Identity RID mapping covering the whole input RID range */ > -idmap = &smmu->id_mapping_array[0]; > -idmap->input_base = 0; > -idmap->id_count = cpu_to_le32(0x); > -idmap->output_base = 0; > -/* output IORT node is the ITS group node (the first node) */ > -idmap->output_reference = cpu_to_le32(iort_node_offset); > +/* > + * Identity RID mapping covering the whole input RID range. > + * The output IORT node is the ITS group node (the first node). > + */ > +fill_iort_idmap(smmu->id_mapping_array, 0, 0, 0x, 0, nit: the other calls use uppercase hex digits Reviewed-by: Jean-Philippe Brucker
Re: [PATCH for-5.0 v11 13/20] virtio-iommu: Implement probe request
On Fri, Nov 22, 2019 at 07:29:36PM +0100, Eric Auger wrote: > This patch implements the PROBE request. At the moment, > no reserved regions are returned as none are registered > per device. Only a NONE property is returned. > > Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker
Re: [PATCH for-5.0 v11 17/20] hw/arm/virt-acpi-build: Add virtio-iommu node in IORT table
On Fri, Nov 22, 2019 at 07:29:40PM +0100, Eric Auger wrote: > This patch builds the virtio-iommu node in the ACPI IORT table. > > The RID space of the root complex, which spans 0x0-0x1 > maps to streamid space 0x0-0x1 in the virtio-iommu which in > turn maps to deviceid space 0x0-0x1 in the ITS group. > > The iommu RID is excluded as described in virtio-iommu > specification. > > Signed-off-by: Eric Auger Reviewed-by: Jean-Philippe Brucker Although VIOT changes the layout of the IORT node slightly, the implementation should stay pretty much the same.
Re: [PATCH for-5.0 v11 19/20] pc: Add support for virtio-iommu-pci
On Fri, Nov 22, 2019 at 07:29:42PM +0100, Eric Auger wrote: > The virtio-iommu-pci is instantiated through the -device QEMU > option. However if instantiated it also requires an IORT ACPI table > to describe the ID mappings between the root complex and the iommu. > > This patch adds the generation of the IORT table if the > virtio-iommu-pci device is instantiated. > > We also declare the [0xfee0 - 0xfeef] MSI reserved region > so that it gets bypassed by the IOMMU. > > Signed-off-by: Eric Auger It would be nice to factor the IORT code with arm, but this looks OK. Reviewed-by: Jean-Philippe Brucker
[PATCH] virtio-mmio: Clear v2 transport state on soft reset
At the moment when the guest writes a status of 0, we only reset the virtio core state but not the virtio-mmio state. The virtio-mmio specification says (v1.1 cs01, 4.2.2.1 Device Requirements: MMIO Device Register Layout): Upon reset, the device MUST clear all bits in InterruptStatus and ready bits in the QueueReady register for all queues in the device. The core already takes care of InterruptStatus by clearing isr, but we still need to clear QueueReady. It would be tempting to clean all registers, but since the specification doesn't say anything more, guests could rely on the registers keeping their state across reset. Linux for example, relies on this for GuestPageSize in the legacy MMIO transport. Fixes: 44e687a4d9ab ("virtio-mmio: implement modern (v2) personality (virtio-1)") Signed-off-by: Jean-Philippe Brucker --- This fixes kexec of a Linux guest that uses the modern virtio-mmio transport. --- hw/virtio/virtio-mmio.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 94d934c44b..ef40b7a9b2 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -65,6 +65,19 @@ static void virtio_mmio_stop_ioeventfd(VirtIOMMIOProxy *proxy) virtio_bus_stop_ioeventfd(&proxy->bus); } +static void virtio_mmio_soft_reset(VirtIOMMIOProxy *proxy) +{ +int i; + +if (proxy->legacy) { +return; +} + +for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { +proxy->vqs[i].enabled = 0; +} +} + static uint64_t virtio_mmio_read(void *opaque, hwaddr offset, unsigned size) { VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque; @@ -378,6 +391,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, if (vdev->status == 0) { virtio_reset(vdev); +virtio_mmio_soft_reset(proxy); } break; case VIRTIO_MMIO_QUEUE_DESC_LOW: -- 2.24.0
Re: [PATCH v14 08/11] virtio-iommu-pci: Introduce the x-dt-binding option
Hi Eric, On Fri, Feb 07, 2020 at 10:32:00AM +0100, Eric Auger wrote: > At the moment, the kernel only supports device tree > integration of the virtio-iommu. DT bindings between the > PCI root complex and the IOMMU must be created by the machine > in conformance to: > > Documentation/devicetree/bindings/virtio/iommu.txt. > > To make sure the end-user is aware of this, force him to use the > temporary device option "x-dt-binding" and also double check the > machine has a hotplug handler for the virtio-iommu-pci device. > This hotplug handler is in charge of creating those DT bindings. > > Signed-off-by: Eric Auger > Suggested-by: Michael S. Tsirkin [...] > @@ -39,6 +42,21 @@ static void virtio_iommu_pci_realize(VirtIOPCIProxy > *vpci_dev, Error **errp) > VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev); > DeviceState *vdev = DEVICE(&dev->vdev); > > +if (!dev->dt_binding) { > +error_setg(errp, > + "Instantiation currently only is possible if the machine " > + "creates device tree iommu-map bindings, ie. ACPI is not " > + "yet supported"); > +error_append_hint(errp, "use -virtio-iommu-pci,x-dt-binding\n"); "use -device virtio-iommu-pci,x-dt-binding"? Can the option be safely removed as soon as we implement a topology description for the remaining platforms? Or will we need to carry it forever for backward-compatibility (ie. ensure that an old command-line invocation that contains this option still works)? Thanks, Jean
Re: [PATCH v14 08/11] virtio-iommu-pci: Introduce the x-dt-binding option
On Fri, Feb 07, 2020 at 11:51:55AM +0100, Auger Eric wrote: > Hi, > > On 2/7/20 11:23 AM, Michael S. Tsirkin wrote: > > On Fri, Feb 07, 2020 at 10:32:00AM +0100, Eric Auger wrote: > >> At the moment, the kernel only supports device tree > >> integration of the virtio-iommu. DT bindings between the > >> PCI root complex and the IOMMU must be created by the machine > >> in conformance to: > >> > >> Documentation/devicetree/bindings/virtio/iommu.txt. > >> > >> To make sure the end-user is aware of this, force him to use the > >> temporary device option "x-dt-binding" and also double check the > >> machine has a hotplug handler for the virtio-iommu-pci device. > >> This hotplug handler is in charge of creating those DT bindings. > >> > >> Signed-off-by: Eric Auger > >> Suggested-by: Michael S. Tsirkin > > > > how about setting it by default from machine class? > Do you mean in ARM virt machine class? But this wouldn't prevent a user > from launching an ACPI booted guest. I thought you wanted the end-user > to know what he does. > > I don't figure out a way to know if the guest is booted in dt or acpi > mode. I can get access to those info: > - whether acpi is enabled That's the default on virt machine right? Then if there is a FW it can choose either ACPI or DT? > - whether a FW is loaded > > But a FW can be loaded, acpi enabled and eventually the guest is DT > booted with acpi=off in kernel opts. > > Maybe at this point I could only support the case where no FW is loaded. > In machvirt I would not register the virtio-iommu-pci hotplug handler in > case a FW is loaded. Then I could get rid of the new x-dt-binding prop. > > Thoughts? Yes, I'm hoping we can get the topology description into next version of Linux, and it would be nicer not to introduce backward-compatible baggage for something that should be solved within a few months. If we have to warn the user then checking the FW seems like a good compromise, and easy to remove later. 
Thanks, Jean > > Eric > > See > > [PATCH 1/2] spapr: Disable legacy virtio devices for pseries-5.0 and later > > [PATCH 2/2] spapr: Enable virtio iommu_platform=on by default > > which does it for spapr. > > > > >> --- > >> > >> May be squashed with previous patch > >> --- > >> hw/virtio/virtio-iommu-pci.c | 18 ++ > >> 1 file changed, 18 insertions(+) > >> > >> diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c > >> index d539fcce75..3d06e14000 100644 > >> --- a/hw/virtio/virtio-iommu-pci.c > >> +++ b/hw/virtio/virtio-iommu-pci.c > >> @@ -14,6 +14,7 @@ > >> #include "virtio-pci.h" > >> #include "hw/virtio/virtio-iommu.h" > >> #include "hw/qdev-properties.h" > >> +#include "qapi/error.h" > >> > >> typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI; > >> > >> @@ -27,10 +28,12 @@ typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI; > >> struct VirtIOIOMMUPCI { > >> VirtIOPCIProxy parent_obj; > >> VirtIOIOMMU vdev; > >> +bool dt_binding; > >> }; > >> > >> static Property virtio_iommu_pci_properties[] = { > >> DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), > >> +DEFINE_PROP_BOOL("x-dt-binding", VirtIOIOMMUPCI, dt_binding, false), > >> DEFINE_PROP_END_OF_LIST(), > >> }; > >> > >> @@ -39,6 +42,21 @@ static void virtio_iommu_pci_realize(VirtIOPCIProxy > >> *vpci_dev, Error **errp) > >> VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev); > >> DeviceState *vdev = DEVICE(&dev->vdev); > >> > >> +if (!dev->dt_binding) { > >> +error_setg(errp, > >> + "Instantiation currently only is possible if the > >> machine " > >> + "creates device tree iommu-map bindings, ie. 
ACPI is > >> not " > >> + "yet supported"); > >> +error_append_hint(errp, "use -virtio-iommu-pci,x-dt-binding\n"); > >> +return; > >> +} > >> + > >> +if (!qdev_get_machine_hotplug_handler(DEVICE(vpci_dev))) { > >> +error_setg(errp, > >> + "The machine does not implement a virtio-iommu-pci > >> hotplug " > >> + " handler that creates the device tree iommu-map > >> bindings"); > >> + return; > >> +} > >> qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); > >> object_property_set_link(OBJECT(dev), > >> OBJECT(pci_get_bus(&vpci_dev->pci_dev)), > >> -- > >> 2.20.1 > > >
Re: virtio-iommu issue with VFIO device downstream to a PCIe-to-PCI bridge: VFIO devices are not assigned any iommu group
Hi, On Mon, Jan 09, 2023 at 10:11:19PM +0100, Eric Auger wrote: > > Jean, do you have any idea about how to fix that? Do you think we have a > > trouble in the acpi/viot setup or virtio-iommu probe sequence. It looks > > like virtio probe and attach commands are called too early, before the > > bus is actually correctly numbered. > > So after further investigations looks this is not a problem of bus > number, which is good at the time of the virtio cmd calls but rather a > problem related to the devfn (0 was used when creating the IOMMU MR) > whereas the virtio-iommu cmds looks for the non aliased devfn. With that > fixed, the probe and attach at least succeeds. The device still does not > work for me but I will continue my investigations and send a tentative fix. If I remember correctly VIOT can deal with bus numbers because bridges are assigned a range by QEMU, but I haven't tested that in detail, and I don't know how it holds with conventional PCI bridges. Do you have an example command-line I could use to experiment (and the fix you're mentioning)? Thanks, Jean
Re: virtio-iommu issue with VFIO device downstream to a PCIe-to-PCI bridge: VFIO devices are not assigned any iommu group
On Fri, Jan 13, 2023 at 10:57:00AM -0700, Alex Williamson wrote: > On Fri, 13 Jan 2023 12:39:18 + > Jean-Philippe Brucker wrote: > > > Hi, > > > > On Mon, Jan 09, 2023 at 10:11:19PM +0100, Eric Auger wrote: > > > > Jean, do you have any idea about how to fix that? Do you think we have a > > > > trouble in the acpi/viot setup or virtio-iommu probe sequence. It looks > > > > like virtio probe and attach commands are called too early, before the > > > > bus is actually correctly numbered. > > > > > > So after further investigations looks this is not a problem of bus > > > number, which is good at the time of the virtio cmd calls but rather a > > > problem related to the devfn (0 was used when creating the IOMMU MR) > > > whereas the virtio-iommu cmds looks for the non aliased devfn. With that > > > fixed, the probe and attach at least succeeds. The device still does not > > > work for me but I will continue my investigations and send a tentative > > > fix. > > > > If I remember correctly VIOT can deal with bus numbers because bridges are > > assigned a range by QEMU, but I haven't tested that in detail, and I don't > > know how it holds with conventional PCI bridges. > > In my reading of the virtio-iommu spec, Hm, is that the virtio-iommu spec or ACPI VIOT/device tree spec? The virtio-iommu spec shouldn't refer to PCI buses at the moment. The intent is that for PCI, the "endpoint ID" passed in an ATTACH request corresponds to PCI segment and RID of PCI devices at the time of the request (so after the OS renumbered the buses). If you found something in the spec that contradicts this, it should be fixed. Note that "endpoint" is a misnomer, it can refer to PCI bridges as well, anything that can issue DMA transactions. > I noted that it specifies the > bus numbers *at the time of OS handoff*, so it essentially washes its > hands of the OS renumbering buses while leaving subtle dependencies on > initial numbering in the guest and QEMU implementations. 
Yes we needed to describe in the firmware tables (device-tree and ACPI VIOT) which devices the IOMMU manages. And at the time we generate the tables, if we want to refer to PCI devices behind bridges, we can either use catch-all ranges for any possible bus numbers they will get, or initialize bus numbers in bridges and pass those to the OS. But that's only to communicate the IOMMU topology to the OS, because we couldn't come up with anything better. After it sets up PCI the OS should be able to use its own configuration of the PCI topology in virtio-iommu requests. > On bare metal, a conventional bridge aliases the devices downstream of > it. We reflect that in QEMU by aliasing those devices to the > AddressSpace of the bridge. IIRC, Linux guests will use a > for-each-dma-alias function when programming IOMMU translation tables > to populate the bridge alias, where a physical IOMMU would essentially > only see that bridge RID. Thanks, Yes there might be something missing in the Linux driver, I'll have a look Thanks, Jean
Re: [RFC] virtio-iommu: Take into account possible aliasing in virtio_iommu_mr()
Hi Eric, On Mon, Jan 16, 2023 at 07:47:09AM -0500, Eric Auger wrote: [...] > once we attempt to plug such devices downstream to the pcie-to-pci > bridge, those devices are put in a singleton group. The pcie-to-pci > bridge disappears from the radar (not attached to any group), and the > pcie root port the pcie-to-pci bridge is plugged onto is put in a > separate singleton group. So the topology is wrong and definitively > different from the one observed with the intel iommu. > > I suspect some other issue in the viot/pci probing on guest kernel > side. I would appreciate on that kernel part. > > qemu command excerpt: > for x86_64: > > -device ioh3420,bus=pcie.0,chassis=1,id=pcie.1,addr=0x2.0x0 > -device pcie-pci-bridge,id=pcie_pci_bridge1,bus=pcie.1 > -netdev tap,id=mynet0,ifname=tap0,script=qemu-ifup,downscript=qemu-ifdown > -device e1000,netdev=mynet0,bus=pcie_pci_bridge1 > > guest pci topology: > > -[:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller >+-01.0 Device 1234: >+-02.0-[01-02]00.0-[02]00.0 Intel Corporation 82540EM > Gigabit Ethernet Controller > > wrong guest topology: > /sys/kernel/iommu_groups/2/devices/:00:02.0 (pcie root port) > /sys/kernel/iommu_groups/1/devices/:02:00.0 (E1000) > no group for :01:00:0 (the pcie-to-pci bridge) This highlights several problems in the kernel: (1) The pcie-to-pci bridge doesn't get an IOMMU group. That's a bug in the VIOT driver, where we use the wrong struct device when dealing with PCI aliases. I'll send a fix. (2) e1000 gets a different group than the pcie-to-pci bridge, and than other devices on that bus. This wrongly enables assigning aliased devices to different VMs. 
It's not specific to virtio-iommu, I can get the same result with SMMUv3, both DT and ACPI: qemu-system-aarch64 -M virt,gic-version=3,its=on,iommu=smmuv3 -cpu max -m 4G -smp 8 -kernel Image -initrd initrd -append console=ttyAMA0 -nographic \ -device pcie-root-port,chassis=1,id=pcie.1,bus=pcie.0 \ -device pcie-pci-bridge,id=pcie.2,bus=pcie.1 \ -device e1000,netdev=net0,bus=pcie.2,addr=1.0 -netdev user,id=net0 \ -device e1000,netdev=net1,bus=pcie.2,addr=2.0 -netdev user,id=net1 I think the logic in pci_device_group() expects the devices to be probed in a specific top-down order, so that when we get to a device, all its parents already have a group. Starting with arbitrary devices would require walking down and across the tree to find aliases which is too complex. As you noted Intel IOMMU doesn't have this problem, because probing happens in the expected order. The driver loads at the right time and bus_iommu_probe() ends up calling pci_device_group() in the right order. For virtio-iommu and SMMU, the drivers are subject to probe deferral, and devices drivers are bound kinda randomly by the driver core. In that case it's acpi/of_iommu_configure() that calls pci_device_group(). I'm not sure what the best approach is to fix this. It seems wrong to rely on previous driver probes in pci_device_group() at all, because even if you probe in the right order, you could build a kernel without the PCIe port driver and the aliased endpoints will still be put in different groups. Maybe pci_device_group(), when creating a new group, should immediately assign that group to all parents that alias the device? Or maybe we can reuse the RID alias infrastructure used by quirks. 
(3) with the SMMU, additional devices on the PCI bus can't get an IOMMU group: [2.019587] e1000 :02:01.0: Adding to iommu group 0 [2.389750] e1000 :02:02.0: stream 512 already in tree Due to cdf315f907d4 ("iommu/arm-smmu-v3: Maintain a SID->device structure"), the SMMUv3 driver doesn't support RID aliasing anymore. The structure needs to be extended with multiple devices per SID, in which case the fault handler will choose an arbitrary device associated to the faulting SID. (4) When everything else works, on Arm ACPI the PCIe root port is put in a separate group due to ACS being enabled, but on other platforms (including Arm DT), the root port is in the same group. I haven't looked into that. > with intel iommu we get the following topology: > /sys/kernel/iommu_groups/2/devices/:02:00.0 > /sys/kernel/iommu_groups/2/devices/:01:00.0 > /sys/kernel/iommu_groups/2/devices/:00:02.0 > > on aarch64 we get the same using those devices: > -device pcie-root-port,bus=pcie.0,chassis=1,id=pcie.1,addr=0x2.0x0 > -device pcie-pci-bridge,id=pcie_pci_bridge1,bus=pcie.1 > -netdev tap,id=mynet0,ifname=tap0,script=qemu-ifup,downscript=qemu-ifdown > -device e1000,netdev=mynet0,bus=pcie_pci_bridge1 > > Signed-off-by: Eric Auger > --- > hw/virtio/virtio-iommu.c | 15 +++ > 1 file changed, 15 insertions(+) > > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 23c470977e..58c367b744 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -178
Re: virtio-iommu issue with VFIO device downstream to a PCIe-to-PCI bridge: VFIO devices are not assigned any iommu group
On Wed, Jan 18, 2023 at 11:28:32AM -0700, Alex Williamson wrote: > The VT-d spec[2](8.3.1) has a more elegant solution using a path > described in a device scope, based on a root bus number (not > susceptible to OS renumbering) and a sequence of devfns to uniquely > describe a hierarchy or endpoint, invariant of OS bus renumbering. That's a good idea, we could describe the hierarchy using only devfns. I think I based VIOT mostly on IORT and device-tree which don't provide that as far as I know, but could have studied DMAR better. One problem is that for virtio-iommu we'd need to update both device-tree and VIOT (and neither are easy to change). But it's worth thinking about because it would solve a problem we currently have, that a virtio-iommu using the virtio-pci transport cannot be placed behind a bridge, including a root port, because the firmware tables cannot refer to it. Thanks, Jean
Re: [RFC] virtio-iommu: Take into account possible aliasing in virtio_iommu_mr()
On Fri, Jan 20, 2023 at 03:50:18PM +, Robin Murphy wrote: > On 2023-01-20 15:28, Jean-Philippe Brucker wrote: > > For some reason this came through as blank mail with a text attachment, Ugh sorry about that, looks like I hit ^D in mutt before sending > so apologies for the lack of quoting, but for reference this is a > long-standing known issue: > > https://lore.kernel.org/linux-iommu/9625faf4-48ef-2dd3-d82f-931d9cf26...@huawei.com/ > > In summary, yes, hanging {of,acpi}_iommu_configure() off driver probe is > massively wrong, and pulling that step into iommu_probe_device() in a sane > and robust manner is one of the next things to start on after getting the > bus ops out of the way. Right, thanks for confirming Thanks, Jean
[RFC PATCH 07/16] target/arm/kvm: Select RME VM type for the scratch VM
Although the VM type does not affect values probed from the scratch vCPU at the moment, it could later. Ensure we specify the right type when creating the temporary VM. Signed-off-by: Jean-Philippe Brucker --- Does the PA size need changing as well? --- target/arm/kvm.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index fcddead4fe..d8655d9041 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -70,6 +70,7 @@ bool kvm_arm_create_scratch_host_vcpu(const uint32_t *cpus_to_try, { int ret = 0, kvmfd = -1, vmfd = -1, cpufd = -1; int max_vm_pa_size; +int vm_type; kvmfd = qemu_open_old("/dev/kvm", O_RDWR); if (kvmfd < 0) { @@ -79,8 +80,10 @@ bool kvm_arm_create_scratch_host_vcpu(const uint32_t *cpus_to_try, if (max_vm_pa_size < 0) { max_vm_pa_size = 0; } + +vm_type = kvm_arm_rme_vm_type(MACHINE(qdev_get_machine())); do { -vmfd = ioctl(kvmfd, KVM_CREATE_VM, max_vm_pa_size); +vmfd = ioctl(kvmfd, KVM_CREATE_VM, vm_type | max_vm_pa_size); } while (vmfd == -1 && errno == EINTR); if (vmfd < 0) { goto err; -- 2.39.0
[RFC PATCH 00/16] arm: Run Arm CCA VMs with KVM
The Arm Realm Management Extension (RME), part of the Arm Confidential Computing Architecture (CCA), enables running confidential virtual machines in a new "Realm" security state. While the host still manages the resources of a guest running in a Realm, it cannot access them. This series adds some support to QEMU for launching a guest in a Realm with KVM. The KVM changes for CCA have been posted at [1]. Patches 2-4 introduce a new RmeGuest object that inherits from ConfidentialGuestSupport and enable it for the virt machine. Like other confidential guest architectures, launching a Realm VM requires two command-line parameters: -object rme-guest,id=, -M confidential-guest-support= Patches 5-6 modify the KVM vCPU support. With CCA, KVM does not sit atop the VM anymore but talks to a new component, the Realm Management Monitor (RMM) which deals with the Realm stage-2 page tables and CPU state. So KVM cannot access most vCPU registers anymore except for passing parameters to RMM when handling VM exits. Likewise, the host must not access any memory assigned to the guest (or else it gets a granule protection fault). The private memfd work [2] by Chao Peng will help with this. Patches 8-9 deal with loading images into the Realm. Those are measured by the RMM and part of the initial measurement, which allows a Realm owner to attest that the Realm is running what it expects. Patches 10-14 pass parameters described in the RMM specification. This initial posting only provides direct kernel boot with DTB, not firmware boot. There is ongoing work to extend edk2 to run in a Realm, which will require changes to QEMU. A few problems will come up: * The FwCfg device provides kernel images, initrd, ACPI tables etc. This isn't an option for CCA because the guest does not trust what the host provides at runtime. I suggest to load all those things in Realm memory before boot, and pass their address in the device tree which is always present at the start of RAM. 
This will require new properties in the device-tree's chosen section. * The guest firmware probably shouldn't be on an emulated flash device. For one thing, it doesn't need flash because it will store all variable in RAM. The flash device also relies on read-only mappings which are not supported by KVM RME at the moment, and trapping reads would break integrity. I suggest to either replace the flash device (address 0 of the virt machine) by RAM when RmeGuest is enabled, or load the firmware somewhere else in RAM. Please see [1] for additional resource, including instructions for building and running the CCA software stack on a model. An example command-line: qemu-system-aarch64 -M virt -cpu host -enable-kvm -M gic-version=3 -smp 2 -m 256M -nographic -M confidential-guest-support=rme0 -object rme-guest,id=rme0,measurement-algo=sha512 -kernel Image -initrd rootfs.cpio -append 'console=ttyAMA0 earlycon' -overcommit mem-lock=on A branch with these patches is available at [3]. [1] https://lore.kernel.org/kvm/20230127112248.136810-1-suzuki.poul...@arm.com/ [2] https://lore.kernel.org/qemu-devel/20221202061347.1070246-1-chao.p.p...@linux.intel.com/ [3] https://jpbrucker.net/git/qemu cca/rfc-v1 Jean-Philippe Brucker (16): NOMERGE: Add KVM Arm RME definitions to Linux headers target/arm: Add confidential guest support target/arm/kvm-rme: Initialize realm hw/arm/virt: Add support for Arm RME target/arm/kvm: Split kvm_arch_get/put_registers target/arm/kvm-rme: Initialize vCPU target/arm/kvm: Select RME VM type for the scratch VM target/arm/kvm-rme: Populate the realm with boot images hw/arm/boot: Populate realm memory with boot images target/arm/kvm-rme: Add measurement algorithm property target/arm/kvm-rme: Add Realm Personalization Value parameter target/arm/kvm-rme: Add Realm SVE vector length target/arm/kvm-rme: Add breakpoints and watchpoints parameters target/arm/kvm-rme: Add PMU num counters parameters target/arm/kvm: Disable Realm reboot target/arm/kvm-rme: Disable 
readonly mappings docs/system/confidential-guest-support.rst | 1 + qapi/qom.json | 32 +- include/sysemu/kvm.h | 2 + linux-headers/asm-arm64/kvm.h | 63 +++ linux-headers/linux/kvm.h | 21 +- target/arm/cpu.h | 3 + target/arm/kvm_arm.h | 21 + accel/kvm/kvm-all.c| 8 +- hw/arm/boot.c | 10 +- hw/arm/virt.c | 48 +- target/arm/helper.c| 8 + target/arm/kvm-rme.c | 505 + target/arm/kvm.c | 20 +- target/arm/kvm64.c | 91 +++- target/arm/meson.build | 7 +- 15 files
[RFC PATCH 12/16] target/arm/kvm-rme: Add Realm SVE vector length
The Realm configuration takes a SVE enable and vector length parameter. We cannot reuse the -cpu SVE parameters for this because that information is needed at Realm Descriptor creation which must happen before VCPU creation. Signed-off-by: Jean-Philippe Brucker --- qapi/qom.json| 5 +++- target/arm/kvm-rme.c | 68 +++- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/qapi/qom.json b/qapi/qom.json index a012281628..94ecb87f6f 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -864,11 +864,14 @@ # # @personalization-value: Realm personalization value (default: 0) # +# @sve-vector-length: SVE vector length (default: 0, SVE disabled) +# # Since: FIXME ## { 'struct': 'RmeGuestProperties', 'data': { '*measurement-algo': 'str', -'*personalization-value': 'str' } } +'*personalization-value': 'str', +'*sve-vector-length': 'uint32' } } ## # @ObjectType: diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index e974c27e5c..0b2153a45c 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -22,7 +22,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) #define RME_PAGE_SIZE qemu_real_host_page_size() -#define RME_MAX_CFG 2 +#define RME_MAX_CFG 3 typedef struct RmeGuest RmeGuest; @@ -30,6 +30,7 @@ struct RmeGuest { ConfidentialGuestSupport parent_obj; char *measurement_algo; char *personalization_value; +uint32_t sve_vl; }; struct RmeImage { @@ -137,6 +138,13 @@ static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) } cfg_str = "personalization value"; break; +case KVM_CAP_ARM_RME_CFG_SVE: +if (!guest->sve_vl) { +return 0; +} +args.sve_vq = guest->sve_vl / 128; +cfg_str = "SVE"; +break; default: g_assert_not_reached(); } @@ -346,6 +354,52 @@ static void rme_set_rpv(Object *obj, const char *value, Error **errp) guest->personalization_value = g_strdup(value); } +static void rme_get_uint32(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); +uint32_t value; + +if (strcmp(name, 
"sve-vector-length") == 0) { +value = guest->sve_vl; +} else { +g_assert_not_reached(); +} + +visit_type_uint32(v, name, &value, errp); +} + +static void rme_set_uint32(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); +uint32_t max_value; +uint32_t value; +uint32_t *var; + +if (!visit_type_uint32(v, name, &value, errp)) { +return; +} + +if (strcmp(name, "sve-vector-length") == 0) { +max_value = ARM_MAX_VQ * 128; +var = &guest->sve_vl; +if (value & 0x7f) { +error_setg(errp, "invalid SVE vector length %"PRIu32, value); +return; +} +} else { +g_assert_not_reached(); +} + +if (value >= max_value) { +error_setg(errp, "invalid %s length %"PRIu32, name, value); +return; +} + +*var = value; +} + static void rme_guest_class_init(ObjectClass *oc, void *data) { object_class_property_add_str(oc, "measurement-algo", @@ -358,6 +412,18 @@ static void rme_guest_class_init(ObjectClass *oc, void *data) rme_set_rpv); object_class_property_set_description(oc, "personalization-value", "Realm personalization value (512-bit hexadecimal number)"); + +/* + * This is not ideal. Normally SVE parameters are given to -cpu, but the + * realm parameters are needed much earlier than CPU initialization. We also + * don't have a way to discover what is supported at the moment, the idea is + * that the user knows exactly what hardware it is running on because these + * parameters are part of the measurement and play in the attestation. + */ +object_class_property_add(oc, "sve-vector-length", "uint32", rme_get_uint32, + rme_set_uint32, NULL, NULL); +object_class_property_set_description(oc, "sve-vector-length", +"SVE vector length. 0 disables SVE (the default)"); } static const TypeInfo rme_guest_info = { -- 2.39.0
[RFC PATCH 09/16] hw/arm/boot: Populate realm memory with boot images
When using the Arm RME, register the images to be loaded into Realm memory at boot. Two operations are needed for each image: (1) INIT_IPA_REALM: mark the load addresses (IPA) as RAM with RMI_RTT_INIT_RIPAS. (2) POPULATE_REALM: move the page into the Realm with RMI_DATA_CREATE. Its content contributes to the initial measurement. The reason we separate (1) from (2) is that we may need to declare more RAM than the image size. In particular booting arm64 Linux requires reserving additional BSS space after the loaded image. We could declare the whole guest RAM with INIT_IPA_REALM, though that might be wasteful in terms of stage-2 mappings if the guest is not going to use all that RAM. Signed-off-by: Jean-Philippe Brucker --- hw/arm/boot.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 15c2bf1867..115d3f5dcc 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -25,6 +25,7 @@ #include "qemu/config-file.h" #include "qemu/option.h" #include "qemu/units.h" +#include "kvm_arm.h" /* Kernel boot protocol is specified in the kernel docs * Documentation/arm/Booting and Documentation/arm64/booting.txt @@ -192,9 +193,11 @@ static void write_bootloader(const char *name, hwaddr addr, code[i] = tswap32(insn); } -assert((len * sizeof(uint32_t)) < BOOTLOADER_MAX_SIZE); +len *= sizeof(uint32_t); +assert(len < BOOTLOADER_MAX_SIZE); -rom_add_blob_fixed_as(name, code, len * sizeof(uint32_t), addr, as); +rom_add_blob_fixed_as(name, code, len, addr, as); +kvm_arm_rme_add_blob(addr, len, len); g_free(code); } @@ -683,6 +686,7 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, * the DTB is copied again upon reset, even if addr points into RAM. 
*/ rom_add_blob_fixed_as("dtb", fdt, size, addr, as); +kvm_arm_rme_add_blob(addr, size, size); qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, rom_ptr_for_as(as, addr, size)); @@ -964,6 +968,7 @@ static uint64_t load_aarch64_image(const char *filename, hwaddr mem_base, *entry = mem_base + kernel_load_offset; rom_add_blob_fixed_as(filename, buffer, size, *entry, as); +kvm_arm_rme_add_blob(*entry, size, kernel_size); g_free(buffer); @@ -1119,6 +1124,7 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu, initrd_size = 0; } info->initrd_size = initrd_size; +kvm_arm_rme_add_blob(info->initrd_start, initrd_size, initrd_size); fixupcontext[FIXUP_BOARDID] = info->board_id; fixupcontext[FIXUP_BOARD_SETUP] = info->board_setup_addr; -- 2.39.0
[RFC PATCH 02/16] target/arm: Add confidential guest support
Add a new RmeGuest object, inheriting from ConfidentialGuestSupport, to support the Arm Realm Management Extension (RME). It is instantiated by passing on the command-line: -M virt,confidential-guest-support= -object rme-guest,id=[,options...] This is only the skeleton. Support will be added in following patches. Signed-off-by: Jean-Philippe Brucker --- docs/system/confidential-guest-support.rst | 1 + qapi/qom.json | 3 +- target/arm/kvm-rme.c | 48 ++ target/arm/meson.build | 7 +++- 4 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 target/arm/kvm-rme.c diff --git a/docs/system/confidential-guest-support.rst b/docs/system/confidential-guest-support.rst index 0c490dbda2..acf46d8856 100644 --- a/docs/system/confidential-guest-support.rst +++ b/docs/system/confidential-guest-support.rst @@ -40,5 +40,6 @@ Currently supported confidential guest mechanisms are: * AMD Secure Encrypted Virtualization (SEV) (see :doc:`i386/amd-memory-encryption`) * POWER Protected Execution Facility (PEF) (see :ref:`power-papr-protected-execution-facility-pef`) * s390x Protected Virtualization (PV) (see :doc:`s390x/protvirt`) +* Arm Realm Management Extension (RME) Other mechanisms may be supported in future. 
diff --git a/qapi/qom.json b/qapi/qom.json index 30e76653ad..7ca27bb86c 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -917,7 +917,8 @@ 'tls-creds-x509', 'tls-cipher-suites', { 'name': 'x-remote-object', 'features': [ 'unstable' ] }, -{ 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] } +{ 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] }, +'rme-guest' ] } ## diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c new file mode 100644 index 00..22aa3dc712 --- /dev/null +++ b/target/arm/kvm-rme.c @@ -0,0 +1,48 @@ +/* + * QEMU Arm RME support + * + * Copyright Linaro 2022 + */ + +#include "qemu/osdep.h" + +#include "exec/confidential-guest-support.h" +#include "hw/boards.h" +#include "hw/core/cpu.h" +#include "kvm_arm.h" +#include "migration/blocker.h" +#include "qapi/error.h" +#include "qom/object_interfaces.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" + +#define TYPE_RME_GUEST "rme-guest" +OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) + +typedef struct RmeGuest RmeGuest; + +struct RmeGuest { +ConfidentialGuestSupport parent_obj; +}; + +static void rme_guest_class_init(ObjectClass *oc, void *data) +{ +} + +static const TypeInfo rme_guest_info = { +.parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, +.name = TYPE_RME_GUEST, +.instance_size = sizeof(struct RmeGuest), +.class_init = rme_guest_class_init, +.interfaces = (InterfaceInfo[]) { +{ TYPE_USER_CREATABLE }, +{ } +} +}; + +static void rme_register_types(void) +{ +type_register_static(&rme_guest_info); +} + +type_init(rme_register_types); diff --git a/target/arm/meson.build b/target/arm/meson.build index 87e911b27f..a2224c0d23 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -40,7 +40,12 @@ arm_ss.add(files( )) arm_ss.add(zlib) -arm_ss.add(when: 'CONFIG_KVM', if_true: files('kvm.c', 'kvm64.c'), if_false: files('kvm-stub.c')) +arm_ss.add(when: 'CONFIG_KVM', + if_true: files( +'kvm.c', +'kvm64.c', +'kvm-rme.c'), + if_false: files('kvm-stub.c')) arm_ss.add(when: 
'TARGET_AARCH64', if_true: files( 'cpu64.c', -- 2.39.0
[RFC PATCH 16/16] target/arm/kvm-rme: Disable readonly mappings
KVM does not support creating read-only mappings for realms at the moment. Add an arch helper to detect whether read-only mappings are supported. Device ROM and flash normally use read-only mappings. Device ROM seems limited to legacy use and does not need to be trusted by the guest, so trapping reads should be fine. Flash on the other hand, is used for the firmware and needs to be both executable and measured. It may be necessary to replace flash with RAM in order to run firmwares like edk2 in realms. Signed-off-by: Jean-Philippe Brucker --- include/sysemu/kvm.h | 2 ++ accel/kvm/kvm-all.c | 8 +++- target/arm/kvm-rme.c | 9 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index e9a97eda8c..8d467c76c6 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -581,5 +581,7 @@ bool kvm_arch_cpu_check_are_resettable(void); bool kvm_dirty_ring_enabled(void); +bool kvm_arch_readonly_mem_allowed(KVMState *s); + uint32_t kvm_dirty_ring_size(void); #endif diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index f99b0becd8..56cdd2e9e9 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2267,6 +2267,11 @@ bool kvm_dirty_ring_enabled(void) return kvm_state->kvm_dirty_ring_size ? 
true : false; } +bool __attribute__((weak)) kvm_arch_readonly_mem_allowed(KVMState *s) +{ +return true; +} + static void query_stats_cb(StatsResultList **result, StatsTarget target, strList *names, strList *targets, Error **errp); static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp); @@ -2520,7 +2525,8 @@ static int kvm_init(MachineState *ms) } kvm_readonly_mem_allowed = -(kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); +(kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0) && +kvm_arch_readonly_mem_allowed(s); kvm_eventfds_allowed = (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 1baed79d46..2812a52aeb 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -62,6 +62,15 @@ bool kvm_arm_rme_enabled(void) return !!cgs_to_rme(cgs); } +/* + * KVM does not support creating read-only stage-2 mappings for realms at the + * moment + */ +bool kvm_arch_readonly_mem_allowed(KVMState *s) +{ +return !kvm_arm_rme_enabled(); +} + static int rme_create_rd(RmeGuest *guest, Error **errp) { int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, -- 2.39.0
[RFC PATCH 03/16] target/arm/kvm-rme: Initialize realm
The machine code calls kvm_arm_rme_vm_type() to get the VM flag and kvm_arm_rme_init() to issue KVM hypercalls in the required order: * create the realm descriptor early, * finalize the REC (vCPU) after the registers are reset, * load images into Realm RAM (in another patch), * activate the realm at the end, at which point the realm is sealed. Signed-off-by: Jean-Philippe Brucker --- target/arm/kvm_arm.h | 14 ++ target/arm/kvm-rme.c | 101 +++ 2 files changed, 115 insertions(+) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 99017b635c..00d3df8cac 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -369,6 +369,11 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa); int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); +int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp); +int kvm_arm_rme_vm_type(MachineState *ms); + +bool kvm_arm_rme_enabled(void); + #else /* @@ -443,6 +448,15 @@ static inline uint32_t kvm_arm_sve_get_vls(CPUState *cs) g_assert_not_reached(); } +static inline int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp) +{ +return 0; +} + +static inline int kvm_arm_rme_vm_type(MachineState *ms) +{ +return 0; +} #endif static inline const char *gic_class_name(void) diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 22aa3dc712..d7cdca1cbf 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -25,6 +25,107 @@ struct RmeGuest { ConfidentialGuestSupport parent_obj; }; +static RmeGuest *cgs_to_rme(ConfidentialGuestSupport *cgs) +{ +if (!cgs) { +return NULL; +} +return (RmeGuest *)object_dynamic_cast(OBJECT(cgs), TYPE_RME_GUEST); +} + +bool kvm_arm_rme_enabled(void) +{ +ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs; + +return !!cgs_to_rme(cgs); +} + +static int rme_create_rd(RmeGuest *guest, Error **errp) +{ +int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_CREATE_RD); + +if (ret) { +error_setg_errno(errp, -ret, "RME: failed to 
create Realm Descriptor"); +} +return ret; +} + +static void rme_vm_state_change(void *opaque, bool running, RunState state) +{ +int ret; +CPUState *cs; + +if (state != RUN_STATE_RUNNING) { +return; +} + +/* + * Now that do_cpu_reset() initialized the boot PC and + * kvm_cpu_synchronize_post_reset() registered it, we can finalize the REC. + */ +CPU_FOREACH(cs) { +ret = kvm_arm_vcpu_finalize(cs, KVM_ARM_VCPU_REC); +if (ret) { +error_setg_errno(&error_fatal, -ret, + "RME: failed to finalize vCPU"); +} +} + +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_ACTIVATE_REALM); +if (ret) { +error_setg_errno(&error_fatal, -ret, "RME: failed to activate realm"); +} +} + +int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp) +{ +int ret; +static Error *rme_mig_blocker; +RmeGuest *guest = cgs_to_rme(cgs); + +if (!guest) { +/* Either no cgs, or another confidential guest type */ +return 0; +} + +if (!kvm_enabled()) { +error_setg(errp, "KVM required for RME"); +return -ENODEV; +} + +if (!kvm_check_extension(kvm_state, KVM_CAP_ARM_RME)) { +error_setg(errp, "KVM does not support RME"); +return -ENODEV; +} + +ret = rme_create_rd(guest, errp); +if (ret) { +return ret; +} + +error_setg(&rme_mig_blocker, "RME: migration is not implemented"); +migrate_add_blocker(rme_mig_blocker, &error_fatal); + +/* + * The realm activation is done last, when the VM starts, after all images + * have been loaded and all vcpus finalized. + */ +qemu_add_vm_change_state_handler(rme_vm_state_change, guest); + +cgs->ready = true; +return 0; +} + +int kvm_arm_rme_vm_type(MachineState *ms) +{ +if (cgs_to_rme(ms->cgs)) { +return KVM_VM_TYPE_ARM_REALM; +} +return 0; +} + static void rme_guest_class_init(ObjectClass *oc, void *data) { } -- 2.39.0
[RFC PATCH 05/16] target/arm/kvm: Split kvm_arch_get/put_registers
The confidential guest support in KVM limits the number of registers that we can read and write. Split the get/put_registers function to prepare for it. Signed-off-by: Jean-Philippe Brucker --- target/arm/kvm64.c | 30 -- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 1197253d12..55191496f3 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -1080,7 +1080,7 @@ static int kvm_arch_put_sve(CPUState *cs) return 0; } -int kvm_arch_put_registers(CPUState *cs, int level) +static int kvm_arm_put_core_regs(CPUState *cs) { struct kvm_one_reg reg; uint64_t val; @@ -1200,6 +1200,19 @@ int kvm_arch_put_registers(CPUState *cs, int level) return ret; } +return 0; +} + +int kvm_arch_put_registers(CPUState *cs, int level) +{ +int ret; +ARMCPU *cpu = ARM_CPU(cs); + +ret = kvm_arm_put_core_regs(cs); +if (ret) { +return ret; +} + write_cpustate_to_list(cpu, true); if (!write_list_to_kvmstate(cpu, level)) { @@ -1293,7 +1306,7 @@ static int kvm_arch_get_sve(CPUState *cs) return 0; } -int kvm_arch_get_registers(CPUState *cs) +static int kvm_arm_get_core_regs(CPUState *cs) { struct kvm_one_reg reg; uint64_t val; @@ -1413,6 +1426,19 @@ int kvm_arch_get_registers(CPUState *cs) } vfp_set_fpcr(env, fpr); +return 0; +} + +int kvm_arch_get_registers(CPUState *cs) +{ +int ret; +ARMCPU *cpu = ARM_CPU(cs); + +ret = kvm_arm_get_core_regs(cs); +if (ret) { +return ret; +} + ret = kvm_get_vcpu_events(cpu); if (ret) { return ret; -- 2.39.0
[RFC PATCH 15/16] target/arm/kvm: Disable Realm reboot
A realm cannot be reset, it must be recreated from scratch. The RMM specification defines states of a Realm as NEW -> ACTIVE -> SYSTEM_OFF, after which the Realm can only be destroyed. A PSCI_SYSTEM_RESET call, which normally reboots the system, puts the Realm in SYSTEM_OFF state. QEMU does not support recreating a VM. Normally, a reboot request by the guest causes all devices to reset, which cannot work for a Realm. Indeed, loading images into Realm memory and changing the PC is only allowed for a Realm in NEW state. Resetting the images for a Realm in SYSTEM_OFF state will cause QEMU to crash with a bus error. Handle reboot requests by the guest more gracefully, by indicating to runstate.c that the vCPUs of a Realm are not resettable, and that QEMU should exit. Signed-off-by: Jean-Philippe Brucker --- target/arm/kvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index d8655d9041..02b5e8009f 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -1071,7 +1071,8 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) bool kvm_arch_cpu_check_are_resettable(void) { -return true; +/* A Realm cannot be reset */ +return !kvm_arm_rme_enabled(); } void kvm_arch_accel_class_init(ObjectClass *oc) -- 2.39.0
[RFC PATCH 04/16] hw/arm/virt: Add support for Arm RME
When confidential-guest-support is enabled for the virt machine, call the RME init function, and add the RME flag to the VM type. * The Realm differentiates non-secure from realm memory using the upper GPA bit. Reserve that bit when creating the memory map, to make sure that device MMIO located in high memory can still fit. * pvtime is disabled for the moment. Since the hypervisor has to write into the shared pvtime page before scheduling a vcpu, it seems incompatible with confidential guests. Signed-off-by: Jean-Philippe Brucker --- hw/arm/virt.c | 48 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index b871350856..df613e634a 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -210,6 +210,11 @@ static const char *valid_cpus[] = { ARM_CPU_TYPE_NAME("max"), }; +static bool virt_machine_is_confidential(VirtMachineState *vms) +{ +return MACHINE(vms)->cgs; +} + static bool cpu_type_valid(const char *cpu) { int i; @@ -247,6 +252,14 @@ static void create_fdt(VirtMachineState *vms) exit(1); } +/* + * Since the devicetree is included in the initial measurement, it must + * not contain random data. + */ +if (virt_machine_is_confidential(vms)) { +vms->dtb_randomness = false; +} + ms->fdt = fdt; /* Header */ @@ -1924,6 +1937,15 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) steal_time = object_property_get_bool(OBJECT(first_cpu), "kvm-steal-time", NULL); +if (virt_machine_is_confidential(vms)) { +/* + * The host cannot write into a confidential guest's memory until the + * guest shares it. Since the host writes the pvtime region before the + * guest gets a chance to set it up, disable pvtime. 
+ */ +steal_time = false; +} + if (kvm_enabled()) { hwaddr pvtime_reg_base = vms->memmap[VIRT_PVTIME].base; hwaddr pvtime_reg_size = vms->memmap[VIRT_PVTIME].size; @@ -2053,10 +2075,11 @@ static void machvirt_init(MachineState *machine) * if the guest has EL2 then we will use SMC as the conduit, * and otherwise we will use HVC (for backwards compatibility and * because if we're using KVM then we must use HVC). + * Realm guests must also use SMC. */ if (vms->secure && firmware_loaded) { vms->psci_conduit = QEMU_PSCI_CONDUIT_DISABLED; -} else if (vms->virt) { +} else if (vms->virt || virt_machine_is_confidential(vms)) { vms->psci_conduit = QEMU_PSCI_CONDUIT_SMC; } else { vms->psci_conduit = QEMU_PSCI_CONDUIT_HVC; @@ -2102,6 +2125,8 @@ static void machvirt_init(MachineState *machine) exit(1); } +kvm_arm_rme_init(machine->cgs, &error_fatal); + create_fdt(vms); assert(possible_cpus->len == max_cpus); @@ -2854,15 +2879,26 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, static int virt_kvm_type(MachineState *ms, const char *type_str) { VirtMachineState *vms = VIRT_MACHINE(ms); +int rme_vm_type = kvm_arm_rme_vm_type(ms); int max_vm_pa_size, requested_pa_size; +int rme_reserve_bit = 0; bool fixed_ipa; -max_vm_pa_size = kvm_arm_get_max_vm_ipa_size(ms, &fixed_ipa); +if (rme_vm_type) { +/* + * With RME, the upper GPA bit differentiates Realm from NS memory. + * Reserve the upper bit to guarantee that highmem devices will fit. + */ +rme_reserve_bit = 1; +} + +max_vm_pa_size = kvm_arm_get_max_vm_ipa_size(ms, &fixed_ipa) - + rme_reserve_bit; /* we freeze the memory map to compute the highest gpa */ virt_set_memmap(vms, max_vm_pa_size); -requested_pa_size = 64 - clz64(vms->highest_gpa); +requested_pa_size = 64 - clz64(vms->highest_gpa) + rme_reserve_bit; /* * KVM requires the IPA size to be at least 32 bits. 
@@ -2883,7 +2919,11 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) * the implicit legacy 40b IPA setting, in which case the kvm_type * must be 0. */ -return fixed_ipa ? 0 : requested_pa_size; +if (fixed_ipa) { +return 0; +} + +return requested_pa_size | rme_vm_type; } static void virt_machine_class_init(ObjectClass *oc, void *data) -- 2.39.0
[RFC PATCH 06/16] target/arm/kvm-rme: Initialize vCPU
The target code calls kvm_arm_vcpu_init() to mark the vCPU as part of a realm. RME support does not use the register lists, because the host can only set the boot PC and registers x0-x7. The rest is private to the Realm and saved/restored by the RMM. Signed-off-by: Jean-Philippe Brucker --- target/arm/cpu.h | 3 ++ target/arm/kvm_arm.h | 1 + target/arm/helper.c | 8 ++ target/arm/kvm-rme.c | 10 +++ target/arm/kvm.c | 12 target/arm/kvm64.c | 65 ++-- 6 files changed, 97 insertions(+), 2 deletions(-) diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 9aeed3c848..7d8397985f 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -937,6 +937,9 @@ struct ArchCPU { /* KVM steal time */ OnOffAuto kvm_steal_time; +/* Realm Management Extension */ +bool kvm_rme; + /* Uniprocessor system with MP extensions */ bool mp_is_up; diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 00d3df8cac..e4dc7fbb8d 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -373,6 +373,7 @@ int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp); int kvm_arm_rme_vm_type(MachineState *ms); bool kvm_arm_rme_enabled(void); +int kvm_arm_rme_vcpu_init(CPUState *cs); #else diff --git a/target/arm/helper.c b/target/arm/helper.c index d8c8223ec3..52360ae2ff 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -126,6 +126,10 @@ bool write_cpustate_to_list(ARMCPU *cpu, bool kvm_sync) int i; bool ok = true; +if (cpu->kvm_rme) { +return ok; +} + for (i = 0; i < cpu->cpreg_array_len; i++) { uint32_t regidx = kvm_to_cpreg_id(cpu->cpreg_indexes[i]); const ARMCPRegInfo *ri; @@ -171,6 +175,10 @@ bool write_list_to_cpustate(ARMCPU *cpu) int i; bool ok = true; +if (cpu->kvm_rme) { +return ok; +} + for (i = 0; i < cpu->cpreg_array_len; i++) { uint32_t regidx = kvm_to_cpreg_id(cpu->cpreg_indexes[i]); uint64_t v = cpu->cpreg_values[i]; diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index d7cdca1cbf..3833b187f9 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c 
@@ -118,6 +118,16 @@ int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp) return 0; } +int kvm_arm_rme_vcpu_init(CPUState *cs) +{ +ARMCPU *cpu = ARM_CPU(cs); + +if (kvm_arm_rme_enabled()) { +cpu->kvm_rme = true; +} +return 0; +} + int kvm_arm_rme_vm_type(MachineState *ms) { if (cgs_to_rme(ms->cgs)) { diff --git a/target/arm/kvm.c b/target/arm/kvm.c index f022c644d2..fcddead4fe 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -449,6 +449,10 @@ int kvm_arm_init_cpreg_list(ARMCPU *cpu) int i, ret, arraylen; CPUState *cs = CPU(cpu); +if (cpu->kvm_rme) { +return 0; +} + rl.n = 0; ret = kvm_vcpu_ioctl(cs, KVM_GET_REG_LIST, &rl); if (ret != -E2BIG) { @@ -521,6 +525,10 @@ bool write_kvmstate_to_list(ARMCPU *cpu) int i; bool ok = true; +if (cpu->kvm_rme) { +return ok; +} + for (i = 0; i < cpu->cpreg_array_len; i++) { struct kvm_one_reg r; uint64_t regidx = cpu->cpreg_indexes[i]; @@ -557,6 +565,10 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level) int i; bool ok = true; +if (cpu->kvm_rme) { +return ok; +} + for (i = 0; i < cpu->cpreg_array_len; i++) { struct kvm_one_reg r; uint64_t regidx = cpu->cpreg_indexes[i]; diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 55191496f3..b6320672b2 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -887,6 +887,11 @@ int kvm_arch_init_vcpu(CPUState *cs) return ret; } +ret = kvm_arm_rme_vcpu_init(cs); +if (ret) { +return ret; +} + if (cpu_isar_feature(aa64_sve, cpu)) { ret = kvm_arm_sve_set_vls(cs); if (ret) { @@ -1080,6 +1085,35 @@ static int kvm_arch_put_sve(CPUState *cs) return 0; } +static int kvm_arm_rme_put_core_regs(CPUState *cs, int level) +{ +int i, ret; +struct kvm_one_reg reg; +ARMCPU *cpu = ARM_CPU(cs); +CPUARMState *env = &cpu->env; + +/* + * The RME ABI only allows us to set 8 GPRs and the PC + */ +for (i = 0; i < 8; i++) { +reg.id = AARCH64_CORE_REG(regs.regs[i]); +reg.addr = (uintptr_t) &env->xregs[i]; +ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); +if (ret) { +return ret; +} +} + 
+reg.id = AARCH64_CORE_REG(regs.pc); +reg.addr = (uintptr_t) &env->pc; +ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); +if (ret) { +return ret; +} + +return 0; +} + static int kvm_arm_put_core_regs(CPUState *cs) { struct kvm_one_reg reg; @@ -1208,7 +1242,11 @@ int kvm_arch_put_registers(CPUState *cs, int le
[RFC PATCH 10/16] target/arm/kvm-rme: Add measurement algorithm property
This option selects which measurement algorithm to use for attestation. Supported values are sha256 and sha512. Signed-off-by: Jean-Philippe Brucker --- qapi/qom.json| 14 - target/arm/kvm-rme.c | 71 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/qapi/qom.json b/qapi/qom.json index 7ca27bb86c..87fe7c31fe 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -855,6 +855,17 @@ 'data': { '*cpu-affinity': ['uint16'], '*node-affinity': ['uint16'] } } +## +# @RmeGuestProperties: +# +# Properties for rme-guest objects. +# +# @measurement-algo: Realm measurement algorithm (default: RMM default) +# +# Since: FIXME +## +{ 'struct': 'RmeGuestProperties', + 'data': { '*measurement-algo': 'str' } } ## # @ObjectType: @@ -985,7 +996,8 @@ 'tls-creds-x509': 'TlsCredsX509Properties', 'tls-cipher-suites': 'TlsCredsProperties', 'x-remote-object':'RemoteObjectProperties', - 'x-vfio-user-server': 'VfioUserServerProperties' + 'x-vfio-user-server': 'VfioUserServerProperties', + 'rme-guest': 'RmeGuestProperties' } } ## diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index c8c019f78a..3929b941ae 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -22,10 +22,13 @@ OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) #define RME_PAGE_SIZE qemu_real_host_page_size() +#define RME_MAX_CFG 1 + typedef struct RmeGuest RmeGuest; struct RmeGuest { ConfidentialGuestSupport parent_obj; +char *measurement_algo; }; struct RmeImage { @@ -62,6 +65,40 @@ static int rme_create_rd(RmeGuest *guest, Error **errp) return ret; } +static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) +{ +int ret; +const char *cfg_str; +struct kvm_cap_arm_rme_config_item args = { +.cfg = cfg, +}; + +switch (cfg) { +case KVM_CAP_ARM_RME_CFG_HASH_ALGO: +if (!guest->measurement_algo) { +return 0; +} +if (!strcmp(guest->measurement_algo, "sha256")) { +args.hash_algo = KVM_CAP_ARM_RME_MEASUREMENT_ALGO_SHA256; +} else if (!strcmp(guest->measurement_algo, "sha512")) { +args.hash_algo = 
KVM_CAP_ARM_RME_MEASUREMENT_ALGO_SHA512; +} else { +g_assert_not_reached(); +} +cfg_str = "hash algorithm"; +break; +default: +g_assert_not_reached(); +} + +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_CONFIG_REALM, (intptr_t)&args); +if (ret) { +error_setg_errno(errp, -ret, "RME: failed to configure %s", cfg_str); +} +return ret; +} + static void rme_populate_realm(gpointer data, gpointer user_data) { int ret; @@ -128,6 +165,7 @@ static void rme_vm_state_change(void *opaque, bool running, RunState state) int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp) { int ret; +int cfg; static Error *rme_mig_blocker; RmeGuest *guest = cgs_to_rme(cgs); @@ -146,6 +184,13 @@ int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp) return -ENODEV; } +for (cfg = 0; cfg < RME_MAX_CFG; cfg++) { +ret = rme_configure_one(guest, cfg, errp); +if (ret) { +return ret; +} +} + ret = rme_create_rd(guest, errp); if (ret) { return ret; @@ -215,8 +260,34 @@ int kvm_arm_rme_vm_type(MachineState *ms) return 0; } +static char *rme_get_measurement_algo(Object *obj, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); + +return g_strdup(guest->measurement_algo); +} + +static void rme_set_measurement_algo(Object *obj, const char *value, + Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); + +if (strncmp(value, "sha256", 6) && +strncmp(value, "sha512", 6)) { +error_setg(errp, "invalid Realm measurement algorithm '%s'", value); +return; +} +g_free(guest->measurement_algo); +guest->measurement_algo = g_strdup(value); +} + static void rme_guest_class_init(ObjectClass *oc, void *data) { +object_class_property_add_str(oc, "measurement-algo", + rme_get_measurement_algo, + rme_set_measurement_algo); +object_class_property_set_description(oc, "measurement-algo", +"Realm measurement algorithm ('sha256', 'sha512')"); } static const TypeInfo rme_guest_info = { -- 2.39.0
[RFC PATCH 01/16] NOMERGE: Add KVM Arm RME definitions to Linux headers
Copy the KVM definitions for Arm RME from the development branch. Don't merge, they will be added from the periodic Linux header sync. Signed-off-by: Jean-Philippe Brucker --- linux-headers/asm-arm64/kvm.h | 63 +++ linux-headers/linux/kvm.h | 21 +--- 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 4bf2d7246e..8e04d6f7ff 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -108,6 +108,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_SVE 4 /* enable SVE for this CPU */ #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ +#define KVM_ARM_VCPU_REC 7 /* VCPU REC state as part of Realm */ struct kvm_vcpu_init { __u32 target; @@ -391,6 +392,68 @@ enum { #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 #define KVM_DEV_ARM_ITS_CTRL_RESET 4 +/* KVM_CAP_ARM_RME on VM fd */ +#define KVM_CAP_ARM_RME_CONFIG_REALM 0 +#define KVM_CAP_ARM_RME_CREATE_RD 1 +#define KVM_CAP_ARM_RME_INIT_IPA_REALM 2 +#define KVM_CAP_ARM_RME_POPULATE_REALM 3 +#define KVM_CAP_ARM_RME_ACTIVATE_REALM 4 + +#define KVM_CAP_ARM_RME_MEASUREMENT_ALGO_SHA2560 +#define KVM_CAP_ARM_RME_MEASUREMENT_ALGO_SHA5121 + +#define KVM_CAP_ARM_RME_RPV_SIZE 64 + +/* List of configuration items accepted for KVM_CAP_ARM_RME_CONFIG_REALM */ +#define KVM_CAP_ARM_RME_CFG_RPV0 +#define KVM_CAP_ARM_RME_CFG_HASH_ALGO 1 +#define KVM_CAP_ARM_RME_CFG_SVE2 +#define KVM_CAP_ARM_RME_CFG_DBG3 +#define KVM_CAP_ARM_RME_CFG_PMU4 + +struct kvm_cap_arm_rme_config_item { + __u32 cfg; + union { + /* cfg == KVM_CAP_ARM_RME_CFG_RPV */ + struct { + __u8rpv[KVM_CAP_ARM_RME_RPV_SIZE]; + }; + + /* cfg == KVM_CAP_ARM_RME_CFG_HASH_ALGO */ + struct { + __u32 hash_algo; + }; + + /* cfg == KVM_CAP_ARM_RME_CFG_SVE */ + struct { + __u32 sve_vq; + }; + + /* cfg == KVM_CAP_ARM_RME_CFG_DBG */ + struct { + __u32 num_brps; + __u32 num_wrps; + }; + + /* cfg == 
KVM_CAP_ARM_RME_CFG_PMU */ + struct { + __u32 num_pmu_cntrs; + }; + /* Fix the size of the union */ + __u8reserved[256]; + }; +}; + +struct kvm_cap_arm_rme_populate_realm_args { + __u64 populate_ipa_base; + __u64 populate_ipa_size; +}; + +struct kvm_cap_arm_rme_init_ipa_args { + __u64 init_ipa_base; + __u64 init_ipa_size; +}; + /* Device Control API on vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index ebdafa576d..9d5affc98a 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -901,14 +901,25 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 /* - * On arm64, machine type can be used to request the physical - * address size for the VM. Bits[7-0] are reserved for the guest - * PA size shift (i.e, log2(PA_Size)). For backward compatibility, - * value 0 implies the default IPA size, 40bits. + * On arm64, machine type can be used to request both the machine type and + * the physical address size for the VM. + * + * Bits[11-8] are reserved for the ARM specific machine type. + * + * Bits[7-0] are reserved for the guest PA size shift (i.e, log2(PA_Size)). + * For backward compatibility, value 0 implies the default IPA size, 40bits. 
*/ +#define KVM_VM_TYPE_ARM_SHIFT 8 +#define KVM_VM_TYPE_ARM_MASK (0xfULL << KVM_VM_TYPE_ARM_SHIFT) +#define KVM_VM_TYPE_ARM(_type) \ + (((_type) << KVM_VM_TYPE_ARM_SHIFT) & KVM_VM_TYPE_ARM_MASK) +#define KVM_VM_TYPE_ARM_NORMAL KVM_VM_TYPE_ARM(0) +#define KVM_VM_TYPE_ARM_REALM KVM_VM_TYPE_ARM(1) + #define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL #define KVM_VM_TYPE_ARM_IPA_SIZE(x)\ ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + /* * ioctls for /dev/kvm fds: */ @@ -1176,6 +1187,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 +#define KVM_CAP_ARM_RME 300 // FIXME: Large number to prevent conflicts + #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing_irqchip { -- 2.39.0
[RFC PATCH 11/16] target/arm/kvm-rme: Add Realm Personalization Value parameter
The Realm Personalization Value (RPV) is provided by the user to distinguish Realms that have the same initial measurement. The user provides a 512-bit hexadecimal number. Signed-off-by: Jean-Philippe Brucker --- qapi/qom.json| 5 ++- target/arm/kvm-rme.c | 72 +++- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/qapi/qom.json b/qapi/qom.json index 87fe7c31fe..a012281628 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -862,10 +862,13 @@ # # @measurement-algo: Realm measurement algorithm (default: RMM default) # +# @personalization-value: Realm personalization value (default: 0) +# # Since: FIXME ## { 'struct': 'RmeGuestProperties', - 'data': { '*measurement-algo': 'str' } } + 'data': { '*measurement-algo': 'str', +'*personalization-value': 'str' } } ## # @ObjectType: diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 3929b941ae..e974c27e5c 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -22,13 +22,14 @@ OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) #define RME_PAGE_SIZE qemu_real_host_page_size() -#define RME_MAX_CFG 1 +#define RME_MAX_CFG 2 typedef struct RmeGuest RmeGuest; struct RmeGuest { ConfidentialGuestSupport parent_obj; char *measurement_algo; +char *personalization_value; }; struct RmeImage { @@ -65,6 +66,45 @@ static int rme_create_rd(RmeGuest *guest, Error **errp) return ret; } +static int rme_parse_rpv(uint8_t *out, const char *in, Error **errp) +{ +int ret; +size_t in_len = strlen(in); + +/* Two chars per byte */ +if (in_len > KVM_CAP_ARM_RME_RPV_SIZE * 2) { +error_setg(errp, "Realm Personalization Value is too large"); +return -E2BIG; +} + +/* + * Parse as big-endian hexadecimal number (most significant byte on the + * left), store little-endian, zero-padded on the right. + */ +while (in_len) { +/* + * Do the lower nibble first to catch invalid inputs such as '2z', and + * to handle the last char. 
+ */ +in_len--; +ret = sscanf(in + in_len, "%1hhx", out); +if (ret != 1) { +error_setg(errp, "Invalid Realm Personalization Value"); +return -EINVAL; +} +if (!in_len) { +break; +} +in_len--; +ret = sscanf(in + in_len, "%2hhx", out++); +if (ret != 1) { +error_setg(errp, "Invalid Realm Personalization Value"); +return -EINVAL; +} +} +return 0; +} + static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) { int ret; @@ -87,6 +127,16 @@ static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) } cfg_str = "hash algorithm"; break; +case KVM_CAP_ARM_RME_CFG_RPV: +if (!guest->personalization_value) { +return 0; +} +ret = rme_parse_rpv(args.rpv, guest->personalization_value, errp); +if (ret) { +return ret; +} +cfg_str = "personalization value"; +break; default: g_assert_not_reached(); } @@ -281,6 +331,21 @@ static void rme_set_measurement_algo(Object *obj, const char *value, guest->measurement_algo = g_strdup(value); } +static char *rme_get_rpv(Object *obj, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); + +return g_strdup(guest->personalization_value); +} + +static void rme_set_rpv(Object *obj, const char *value, Error **errp) +{ +RmeGuest *guest = RME_GUEST(obj); + +g_free(guest->personalization_value); +guest->personalization_value = g_strdup(value); +} + static void rme_guest_class_init(ObjectClass *oc, void *data) { object_class_property_add_str(oc, "measurement-algo", @@ -288,6 +353,11 @@ static void rme_guest_class_init(ObjectClass *oc, void *data) rme_set_measurement_algo); object_class_property_set_description(oc, "measurement-algo", "Realm measurement algorithm ('sha256', 'sha512')"); + +object_class_property_add_str(oc, "personalization-value", rme_get_rpv, + rme_set_rpv); +object_class_property_set_description(oc, "personalization-value", +"Realm personalization value (512-bit hexadecimal number)"); } static const TypeInfo rme_guest_info = { -- 2.39.0
[RFC PATCH 08/16] target/arm/kvm-rme: Populate the realm with boot images
Initialize the GPA space and populate it with boot images (kernel, initrd, firmware, etc). Populating has to be done at VM start time, because the images are loaded during reset by rom_reset() Signed-off-by: Jean-Philippe Brucker --- target/arm/kvm_arm.h | 6 target/arm/kvm-rme.c | 79 2 files changed, 85 insertions(+) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index e4dc7fbb8d..cec6500603 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -371,6 +371,7 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp); int kvm_arm_rme_vm_type(MachineState *ms); +void kvm_arm_rme_add_blob(hwaddr start, hwaddr src_size, hwaddr dst_size); bool kvm_arm_rme_enabled(void); int kvm_arm_rme_vcpu_init(CPUState *cs); @@ -458,6 +459,11 @@ static inline int kvm_arm_rme_vm_type(MachineState *ms) { return 0; } + +static inline void kvm_arm_rme_add_blob(hwaddr start, hwaddr src_size, +hwaddr dst_size) +{ +} #endif static inline const char *gic_class_name(void) diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 3833b187f9..c8c019f78a 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -9,6 +9,7 @@ #include "exec/confidential-guest-support.h" #include "hw/boards.h" #include "hw/core/cpu.h" +#include "hw/loader.h" #include "kvm_arm.h" #include "migration/blocker.h" #include "qapi/error.h" @@ -19,12 +20,22 @@ #define TYPE_RME_GUEST "rme-guest" OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) +#define RME_PAGE_SIZE qemu_real_host_page_size() + typedef struct RmeGuest RmeGuest; struct RmeGuest { ConfidentialGuestSupport parent_obj; }; +struct RmeImage { +hwaddr base; +hwaddr src_size; +hwaddr dst_size; +}; + +static GSList *rme_images; + static RmeGuest *cgs_to_rme(ConfidentialGuestSupport *cgs) { if (!cgs) { @@ -51,6 +62,38 @@ static int rme_create_rd(RmeGuest *guest, Error **errp) return ret; } +static void rme_populate_realm(gpointer data, gpointer user_data) +{ +int 
ret; +struct RmeImage *image = data; +struct kvm_cap_arm_rme_init_ipa_args init_args = { +.init_ipa_base = image->base, +.init_ipa_size = image->dst_size, +}; +struct kvm_cap_arm_rme_populate_realm_args populate_args = { +.populate_ipa_base = image->base, +.populate_ipa_size = image->src_size, +}; + +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_INIT_IPA_REALM, +(intptr_t)&init_args); +if (ret) { +error_setg_errno(&error_fatal, -ret, + "RME: failed to initialize GPA range (0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx")", + image->base, image->dst_size); +} + +ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, +KVM_CAP_ARM_RME_POPULATE_REALM, +(intptr_t)&populate_args); +if (ret) { +error_setg_errno(&error_fatal, -ret, + "RME: failed to populate realm (0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx")", + image->base, image->src_size); +} +} + static void rme_vm_state_change(void *opaque, bool running, RunState state) { int ret; @@ -72,6 +115,9 @@ static void rme_vm_state_change(void *opaque, bool running, RunState state) } } +g_slist_foreach(rme_images, rme_populate_realm, NULL); +g_slist_free_full(g_steal_pointer(&rme_images), g_free); + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_RME, 0, KVM_CAP_ARM_RME_ACTIVATE_REALM); if (ret) { @@ -118,6 +164,39 @@ int kvm_arm_rme_init(ConfidentialGuestSupport *cgs, Error **errp) return 0; } +/* + * kvm_arm_rme_add_blob - Initialize the Realm IPA range and set up the image. + * + * @src_size is the number of bytes of the source image, to be populated into + * Realm memory. + * @dst_size is the effective image size, which may be larger than @src_size. + * For a kernel @dst_size may include zero-initialized regions such as the BSS + * and initial page directory. 
+ */ +void kvm_arm_rme_add_blob(hwaddr base, hwaddr src_size, hwaddr dst_size) +{ +struct RmeImage *image; + +if (!kvm_arm_rme_enabled()) { +return; +} + +base = QEMU_ALIGN_DOWN(base, RME_PAGE_SIZE); +src_size = QEMU_ALIGN_UP(src_size, RME_PAGE_SIZE); +dst_size = QEMU_ALIGN_UP(dst_size, RME_PAGE_SIZE); + +image = g_malloc0(sizeof(*image)); +image->base = base; +image->src_size = src_size; +image->dst_size = dst_size; + +/* + * The ROM loader will onl
[RFC PATCH 14/16] target/arm/kvm-rme: Add PMU num counters parameters
Pass the num_cntrs parameter to Realm creation. These parameters contribute to the initial Realm measurement. Signed-off-by: Jean-Philippe Brucker --- qapi/qom.json| 5 - target/arm/kvm-rme.c | 21 - 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/qapi/qom.json b/qapi/qom.json index 86ed386f26..13c85abde9 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -870,6 +870,8 @@ # # @num-watchpoints: Number of watchpoints (default: 0) # +# @num-pmu-counters: Number of PMU counters (default: 0, PMU disabled) +# # Since: FIXME ## { 'struct': 'RmeGuestProperties', @@ -877,7 +879,8 @@ '*personalization-value': 'str', '*sve-vector-length': 'uint32', '*num-breakpoints': 'uint32', -'*num-watchpoints': 'uint32' } } +'*num-watchpoints': 'uint32', +'*num-pmu-counters': 'uint32' } } ## # @ObjectType: diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 3f39f1f7ad..1baed79d46 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -24,7 +24,8 @@ OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) #define RME_MAX_BPS 0x10 #define RME_MAX_WPS 0x10 -#define RME_MAX_CFG 4 +#define RME_MAX_PMU_CTRS0x20 +#define RME_MAX_CFG 5 typedef struct RmeGuest RmeGuest; @@ -35,6 +36,7 @@ struct RmeGuest { uint32_t sve_vl; uint32_t num_wps; uint32_t num_bps; +uint32_t num_pmu_cntrs; }; struct RmeImage { @@ -157,6 +159,13 @@ static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) args.num_wrps = guest->num_wps; cfg_str = "debug parameters"; break; +case KVM_CAP_ARM_RME_CFG_PMU: +if (!guest->num_pmu_cntrs) { +return 0; +} +args.num_pmu_cntrs = guest->num_pmu_cntrs; +cfg_str = "PMU"; +break; default: g_assert_not_reached(); } @@ -378,6 +387,8 @@ static void rme_get_uint32(Object *obj, Visitor *v, const char *name, value = guest->num_bps; } else if (strcmp(name, "num-watchpoints") == 0) { value = guest->num_wps; +} else if (strcmp(name, "num-pmu-counters") == 0) { +value = guest->num_pmu_cntrs; } else { g_assert_not_reached(); } @@ -410,6 +421,9 @@ static void 
rme_set_uint32(Object *obj, Visitor *v, const char *name, } else if (strcmp(name, "num-watchpoints") == 0) { max_value = RME_MAX_WPS; var = &guest->num_wps; +} else if (strcmp(name, "num-pmu-counters") == 0) { +max_value = RME_MAX_PMU_CTRS; +var = &guest->num_pmu_cntrs; } else { g_assert_not_reached(); } @@ -456,6 +470,11 @@ static void rme_guest_class_init(ObjectClass *oc, void *data) rme_set_uint32, NULL, NULL); object_class_property_set_description(oc, "num-watchpoints", "Number of watchpoints"); + +object_class_property_add(oc, "num-pmu-counters", "uint32", rme_get_uint32, + rme_set_uint32, NULL, NULL); +object_class_property_set_description(oc, "num-pmu-counters", +"Number of PMU counters"); } static const TypeInfo rme_guest_info = { -- 2.39.0
[RFC PATCH 13/16] target/arm/kvm-rme: Add breakpoints and watchpoints parameters
Pass the num_bps and num_wps parameters to Realm creation. These parameters contribute to the initial Realm measurement. Signed-off-by: Jean-Philippe Brucker --- qapi/qom.json| 8 +++- target/arm/kvm-rme.c | 34 +- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/qapi/qom.json b/qapi/qom.json index 94ecb87f6f..86ed386f26 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -866,12 +866,18 @@ # # @sve-vector-length: SVE vector length (default: 0, SVE disabled) # +# @num-breakpoints: Number of breakpoints (default: 0) +# +# @num-watchpoints: Number of watchpoints (default: 0) +# # Since: FIXME ## { 'struct': 'RmeGuestProperties', 'data': { '*measurement-algo': 'str', '*personalization-value': 'str', -'*sve-vector-length': 'uint32' } } +'*sve-vector-length': 'uint32', +'*num-breakpoints': 'uint32', +'*num-watchpoints': 'uint32' } } ## # @ObjectType: diff --git a/target/arm/kvm-rme.c b/target/arm/kvm-rme.c index 0b2153a45c..3f39f1f7ad 100644 --- a/target/arm/kvm-rme.c +++ b/target/arm/kvm-rme.c @@ -22,7 +22,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(RmeGuest, RME_GUEST) #define RME_PAGE_SIZE qemu_real_host_page_size() -#define RME_MAX_CFG 3 +#define RME_MAX_BPS 0x10 +#define RME_MAX_WPS 0x10 +#define RME_MAX_CFG 4 typedef struct RmeGuest RmeGuest; @@ -31,6 +33,8 @@ struct RmeGuest { char *measurement_algo; char *personalization_value; uint32_t sve_vl; +uint32_t num_wps; +uint32_t num_bps; }; struct RmeImage { @@ -145,6 +149,14 @@ static int rme_configure_one(RmeGuest *guest, uint32_t cfg, Error **errp) args.sve_vq = guest->sve_vl / 128; cfg_str = "SVE"; break; +case KVM_CAP_ARM_RME_CFG_DBG: +if (!guest->num_bps && !guest->num_wps) { +return 0; +} +args.num_brps = guest->num_bps; +args.num_wrps = guest->num_wps; +cfg_str = "debug parameters"; +break; default: g_assert_not_reached(); } @@ -362,6 +374,10 @@ static void rme_get_uint32(Object *obj, Visitor *v, const char *name, if (strcmp(name, "sve-vector-length") == 0) { value = guest->sve_vl; +} else if (strcmp(name, 
"num-breakpoints") == 0) { +value = guest->num_bps; +} else if (strcmp(name, "num-watchpoints") == 0) { +value = guest->num_wps; } else { g_assert_not_reached(); } @@ -388,6 +404,12 @@ static void rme_set_uint32(Object *obj, Visitor *v, const char *name, error_setg(errp, "invalid SVE vector length %"PRIu32, value); return; } +} else if (strcmp(name, "num-breakpoints") == 0) { +max_value = RME_MAX_BPS; +var = &guest->num_bps; +} else if (strcmp(name, "num-watchpoints") == 0) { +max_value = RME_MAX_WPS; +var = &guest->num_wps; } else { g_assert_not_reached(); } @@ -424,6 +446,16 @@ static void rme_guest_class_init(ObjectClass *oc, void *data) rme_set_uint32, NULL, NULL); object_class_property_set_description(oc, "sve-vector-length", "SVE vector length. 0 disables SVE (the default)"); + +object_class_property_add(oc, "num-breakpoints", "uint32", rme_get_uint32, + rme_set_uint32, NULL, NULL); +object_class_property_set_description(oc, "num-breakpoints", +"Number of breakpoints"); + +object_class_property_add(oc, "num-watchpoints", "uint32", rme_get_uint32, + rme_set_uint32, NULL, NULL); +object_class_property_set_description(oc, "num-watchpoints", +"Number of watchpoints"); } static const TypeInfo rme_guest_info = { -- 2.39.0
Re: [PATCH 1/2] hw/arm/virt: Rename default_bus_bypass_iommu
Hi Markus, On Thu, Nov 25, 2021 at 08:11:04AM +0100, Markus Armbruster wrote: > Peter, this patch fixes a bug that became a regression when the fix > missed 6.1. It's been stuck on the list since August. Please have a > look, and if it's good, get it merged. I'll ask the i386/pc maintainers > to do the same for PATCH 2. Both fixes have been merged in v6.2 (9dad363a223d and 739b38630c45) Thanks, Jean > > Jean-Philippe Brucker writes: > > > Since commit d8fb7d0969d5 ("vl: switch -M parsing to keyval"), machine > > parameter definitions cannot use underscores, because keyval_dashify() > > transforms them to dashes and the parser doesn't find the parameter. > > > > This affects option default_bus_bypass_iommu which was introduced in the > > same release: > > > > $ qemu-system-aarch64 -M virt,default_bus_bypass_iommu=on > > qemu-system-aarch64: Property 'virt-6.1-machine.default-bus-bypass-iommu' > > not found > > > > Rename the parameter to "default-bus-bypass-iommu". Passing > > "default_bus_bypass_iommu" is still valid since the underscore are > > transformed automatically. > > > > Fixes: 6d7a85483a06 ("hw/arm/virt: Add default_bus_bypass_iommu machine > > option") > > Signed-off-by: Jean-Philippe Brucker > > --- > > hw/arm/virt.c | 4 ++-- > > 1 file changed, 2 insertions(+), 2 deletions(-) > > > > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > > index b4598d3fe6..7075cdc15e 100644 > > --- a/hw/arm/virt.c > > +++ b/hw/arm/virt.c > > @@ -2671,10 +2671,10 @@ static void virt_machine_class_init(ObjectClass > > *oc, void *data) > >"Set the IOMMU type. 
" > >"Valid values are none and > > smmuv3"); > > > > -object_class_property_add_bool(oc, "default_bus_bypass_iommu", > > +object_class_property_add_bool(oc, "default-bus-bypass-iommu", > > virt_get_default_bus_bypass_iommu, > > virt_set_default_bus_bypass_iommu); > > -object_class_property_set_description(oc, "default_bus_bypass_iommu", > > +object_class_property_set_description(oc, "default-bus-bypass-iommu", > >"Set on/off to enable/disable " > >"bypass_iommu for default root > > bus"); >
Re: [PATCH v4 00/10] Add stage-2 translation for SMMUv3
On Tue, May 16, 2023 at 08:33:07PM +, Mostafa Saleh wrote: > This patch series can be used to run Linux pKVM SMMUv3 patches (currently on > the list) > which controls stage-2 (from EL2) while providing a paravirtualized > interface the host(EL1) > https://lore.kernel.org/kvmarm/20230201125328.2186498-1-jean-phili...@linaro.org/ I've been using these patches for pKVM, and also tested the normal stage-2 flow with Linux and VFIO Tested-by: Jean-Philippe Brucker
Re: [PATCH v2] virtio-iommu: Fix the partial copy of probe request
Hi, On Wed, Jun 22, 2022 at 12:20:45PM +0200, Eric Auger wrote: > Hi, > > On 6/17/22 08:20, Zhenzhong Duan wrote: > > The structure of probe request doesn't include the tail, this leads > > to a few field missed to be copied. Currently this isn't an issue as > > those missed field belong to reserved field, just in case reserved > > field will be used in the future. > > > > Fixes: 1733eebb9e75b ("virtio-iommu: Implement RESV_MEM probe request") > > Signed-off-by: Zhenzhong Duan > nice catch. > > Reviewed-by: Eric Auger > > the spec is pretty confusing here though (virtio-v1.2-csd01.pdf) as it > presents the struct as follows: > > struct virtio_iommu_req_probe { > struct virtio_iommu_req_head head; > /* Device-readable */ > le32 endpoint; > u8 reserved[64]; > > /* Device-writable */ > u8 properties[probe_size]; > struct virtio_iommu_req_tail tail; > }; Hm, which part is confusing? Yes it's not valid C since probe_size is defined dynamically ('probe_size' in the device config), but I thought it would be nicer to show the whole request layout this way. Besides, at least virtio-blk and virtio-scsi have similar variable-sized arrays in their definitions > > Adding Jean in the loop ... > > Thanks! 
> > Eric > > > > > > --- > > v2: keep bugfix change and drop cleanup change > > > > hw/virtio/virtio-iommu.c | 3 ++- > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > > index 7c122ab95780..195f909620ab 100644 > > --- a/hw/virtio/virtio-iommu.c > > +++ b/hw/virtio/virtio-iommu.c > > @@ -708,7 +708,8 @@ static int virtio_iommu_handle_probe(VirtIOIOMMU *s, > > uint8_t *buf) > > { > > struct virtio_iommu_req_probe req; > > -int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); > > +int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, > > +sizeof(req) + sizeof(struct virtio_iommu_req_tail)); Not sure this is correct, because what we are doing here is reading the device-readable part of the property from the request. That part is only composed of fields 'head', 'endpoint' and 'reserved[64]' and its size is indeed sizeof(struct virtio_iommu_req_probe). The 'properties' and 'tail' fields shouldn't be read by the device here, they are instead written later. It is virtio_iommu_handle_command() that copies both of them into the request: output_size = s->config.probe_size + sizeof(tail); buf = g_malloc0(output_size); ptail = (struct virtio_iommu_req_tail *) (buf + s->config.probe_size); ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf); // and virtio_iommu_probe() fills 'properties' as needed ... // then copy the lot sz = iov_from_buf(elem->in_sg, elem->in_num, 0, buf ? buf : &tail, output_size); So I think the current code is correct, as all fields are accounted for Thanks, Jean > > > > return ret ? ret : virtio_iommu_probe(s, &req, buf); > > } >
Re: [PATCH v2] virtio-iommu: Fix the partial copy of probe request
On Wed, Jun 22, 2022 at 02:22:18PM +0200, Eric Auger wrote: > >> the spec is pretty confusing here though (virtio-v1.2-csd01.pdf) as it > >> presents the struct as follows: > >> > >> struct virtio_iommu_req_probe { > >> struct virtio_iommu_req_head head; > >> /* Device-readable */ > >> le32 endpoint; > >> u8 reserved[64]; > >> > >> /* Device-writable */ > >> u8 properties[probe_size]; > >> struct virtio_iommu_req_tail tail; > >> }; > > Hm, which part is confusing? Yes it's not valid C since probe_size is > > defined dynamically ('probe_size' in the device config), but I thought it > > would be nicer to show the whole request layout this way. Besides, at > > least virtio-blk and virtio-scsi have similar variable-sized arrays in > > their definitions > the fact "struct virtio_iommu_req_tail tail;" was part of the > > virtio_iommu_req_probe struct Right, it would have been better to use a different name than virtio_iommu_req_probe in virtio_iommu.h, to make the pitfall clear. The larger problem is using C structs across the virtio spec instead of an abstract format. Someone implementing the device in another language would already not encounter this problem since they would read the spec as an abstract format. For documentation purposes I do prefer displaying the whole struct like this rather than working around limitations of C, which may be more confusing. 
> >>> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > >>> index 7c122ab95780..195f909620ab 100644 > >>> --- a/hw/virtio/virtio-iommu.c > >>> +++ b/hw/virtio/virtio-iommu.c > >>> @@ -708,7 +708,8 @@ static int virtio_iommu_handle_probe(VirtIOIOMMU *s, > >>> uint8_t *buf) > >>> { > >>> struct virtio_iommu_req_probe req; > >>> -int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); > >>> +int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, > >>> +sizeof(req) + sizeof(struct virtio_iommu_req_tail)); > > Not sure this is correct, because what we are doing here is reading the > > device-readable part of the property from the request. That part is only > > composed of fields 'head', 'endpoint' and 'reserved[64]' and its size is > > indeed sizeof(struct virtio_iommu_req_probe). > > > > The 'properties' and 'tail' fields shouldn't be read by the device here, > > they are instead written later. It is virtio_iommu_handle_command() that > > copies both of them into the request: > > > > output_size = s->config.probe_size + sizeof(tail); > > buf = g_malloc0(output_size); > > > > ptail = (struct virtio_iommu_req_tail *) > > (buf + s->config.probe_size); > > ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf); > > // and virtio_iommu_probe() fills 'properties' as needed > > ... > > > > // then copy the lot > > sz = iov_from_buf(elem->in_sg, elem->in_num, 0, > > buf ? buf : &tail, output_size); > > > > So I think the current code is correct, as all fields are accounted for > > In virtio_iommu_iov_to_req(), payload_sz is computed as > > payload_sz = req_sz - sizeof(struct virtio_iommu_req_tail); > > sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz); > > This works for other command structs but not for probe one. Aah right sorry. The resulting code may be less confusing if we moved "- sizeof(struct virtio_iommu_req_tail)" to virtio_iommu_handle_req() Thanks, Jean
Re: [PATCH v2] virtio-iommu: Fix the partial copy of probe request
On Thu, Jun 23, 2022 at 01:40:58AM +0000, Duan, Zhenzhong wrote: > > > >-----Original Message----- > >From: Jean-Philippe Brucker > >Sent: Wednesday, June 22, 2022 9:58 PM > >To: Eric Auger > >Cc: Duan, Zhenzhong ; qemu- > >de...@nongnu.org; m...@redhat.com > >Subject: Re: [PATCH v2] virtio-iommu: Fix the partial copy of probe request > > > >On Wed, Jun 22, 2022 at 02:22:18PM +0200, Eric Auger wrote: > >> >> the spec is pretty confusing here though (virtio-v1.2-csd01.pdf) as > >> >> it presents the struct as follows: > >> >> > >> >> struct virtio_iommu_req_probe { > >> >> struct virtio_iommu_req_head head; > >> >> /* Device-readable */ > >> >> le32 endpoint; > >> >> u8 reserved[64]; > >> >> > >> >> /* Device-writable */ > >> >> u8 properties[probe_size]; > >> >> struct virtio_iommu_req_tail tail; > >> >> }; > >> > Hm, which part is confusing? Yes it's not valid C since probe_size > >> > is defined dynamically ('probe_size' in the device config), but I > >> > thought it would be nicer to show the whole request layout this way. > >> > Besides, at least virtio-blk and virtio-scsi have similar > >> > variable-sized arrays in their definitions > >> the fact "struct virtio_iommu_req_tail tail;" was part of the > >> > >> virtio_iommu_req_probe struct > > > >Right, it would have been better to use a different name than > >virtio_iommu_req_probe in virtio_iommu.h, to make the pitfall clear. > > > Maybe virtio_iommu_req_probe_no_tail? Yes, we can't change the probe struct anymore since it's API, but we could use the no_tail prefix on future structs Thanks, Jean
Re: [PATCH v3] virtio-iommu: Fix the partial copy of probe request
On Thu, Jun 23, 2022 at 10:31:52AM +0800, Zhenzhong Duan wrote: > The structure of probe request doesn't include the tail, this leads > to a few field missed to be copied. Currently this isn't an issue as > those missed field belong to reserved field, just in case reserved > field will be used in the future. > > Changed 4th parameter of virtio_iommu_iov_to_req() to receive size > of device-readable part. > > Fixes: 1733eebb9e75b ("virtio-iommu: Implement RESV_MEM probe request") > Signed-off-by: Zhenzhong Duan Reviewed-by: Jean-Philippe Brucker > --- > v3: moved "- sizeof(struct virtio_iommu_req_tail)" to > virtio_iommu_handle_req() per Jean > v2: keep bugfix change and drop cleanup change > > hw/virtio/virtio-iommu.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 7c122ab95780..08b227e828f8 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -675,11 +675,10 @@ static int virtio_iommu_probe(VirtIOIOMMU *s, > > static int virtio_iommu_iov_to_req(struct iovec *iov, > unsigned int iov_cnt, > - void *req, size_t req_sz) > + void *req, size_t payload_sz) > { > -size_t sz, payload_sz = req_sz - sizeof(struct virtio_iommu_req_tail); > +size_t sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz); > > -sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz); > if (unlikely(sz != payload_sz)) { > return VIRTIO_IOMMU_S_INVAL; > } > @@ -692,7 +691,8 @@ static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, > \ > unsigned int iov_cnt) \ > { \ > struct virtio_iommu_req_ ## __req req; \ > -int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); \ > +int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, \ > +sizeof(req) - sizeof(struct virtio_iommu_req_tail));\ > \ > return ret ? ret : virtio_iommu_ ## __req(s, &req); \ > } > -- > 2.25.1 >
Re: virtio-iommu hotplug issue
Hello, On Thu, Apr 13, 2023 at 01:49:43PM +0900, Akihiko Odaki wrote: > Hi, > > Recently I encountered a problem with the combination of Linux's > virtio-iommu driver and QEMU when a SR-IOV virtual function gets disabled. > I'd like to ask you what kind of solution is appropriate here and implement > the solution if possible. > > A PCIe device implementing the SR-IOV specification exports a virtual > function, and the guest can enable or disable it at runtime by writing to a > configuration register. This effectively looks like a PCI device is > hotplugged for the guest. Just so I understand this better: the guest gets a whole PCIe device PF that implements SR-IOV, and so the guest can dynamically create VFs? Out of curiosity, is that a hardware device assigned to the guest with VFIO, or a device emulated by QEMU? > In such a case, the kernel assumes the endpoint is > detached from the virtio-iommu domain, but QEMU actually does not detach it. > > This inconsistent view of the removed device sometimes prevents the VM from > correctly performing the following procedure, for example: > 1. Enable a VF. > 2. Disable the VF. > 3. Open a vfio container. > 4. Open the group which the PF belongs to. > 5. Add the group to the vfio container. > 6. Map some memory region. > 7. Close the group. > 8. Close the vfio container. > 9. Repeat 3-8 > > When the VF gets disabled, the kernel assumes the endpoint is detached from > the IOMMU domain, but QEMU actually doesn't detach it. Later, the domain > will be reused in step 3-8. > > In step 7, the PF will be detached, and the kernel thinks there is no > endpoint attached and the mapping the domain holds is cleared, but the VF > endpoint is still attached and the mapping is kept intact. > > In step 9, the same domain will be reused again, and the kernel requests to > create a new mapping, but it will conflict with the existing mapping and > result in -EINVAL. 
> > This problem can be fixed by either of: > - requesting the detachment of the endpoint from the guest when the PCI > device is unplugged (the VF is disabled) Yes, I think this is an issue in the virtio-iommu driver, which should be sending a DETACH request when the VF is disabled, likely from viommu_release_device(). I'll work on a fix unless you would like to do it > - detecting that the PCI device is gone and automatically detach it on > QEMU-side. > > It is not completely clear for me which solution is more appropriate as the > virtio-iommu specification is written in a way independent of the endpoint > mechanism and does not say what should be done when a PCI device is > unplugged. Yes, I'm not sure it's in scope for the specification, it's more about software guidance Thanks, Jean
Re: virtio-iommu hotplug issue
On Thu, Apr 13, 2023 at 08:01:54PM +0900, Akihiko Odaki wrote: > Yes, that's right. The guest can dynamically create and delete VFs. The > device is emulated by QEMU: igb, an Intel NIC recently added to QEMU and > projected to be released as part of QEMU 8.0. Ah great, that's really useful, I'll add it to my tests > > Yes, I think this is an issue in the virtio-iommu driver, which should be > > sending a DETACH request when the VF is disabled, likely from > > viommu_release_device(). I'll work on a fix unless you would like to do it > > It will be nice if you prepare a fix. I will test your patch with my > workload if you share it with me. I sent a fix: https://lore.kernel.org/linux-iommu/20230414150744.562456-1-jean-phili...@linaro.org/ Thank you for reporting this, it must have been annoying to debug Thanks, Jean
[PATCH] kvm: Merge kvm_check_extension() and kvm_vm_check_extension()
The KVM_CHECK_EXTENSION ioctl can be issued either on the global fd (/dev/kvm), or on the VM fd obtained with KVM_CREATE_VM. For most extensions, KVM returns the same value with either method, but for some of them it can refine the returned value depending on the VM type. The KVM documentation [1] advises to use the VM fd: Based on their initialization different VMs may have different capabilities. It is thus encouraged to use the vm ioctl to query for capabilities (available with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) Ongoing work on Arm confidential VMs confirms this, as some capabilities become unavailable to confidential VMs, requiring changes in QEMU to use kvm_vm_check_extension() instead of kvm_check_extension() [2]. Rather than changing each check one by one, change kvm_check_extension() to always issue the ioctl on the VM fd when available, and remove kvm_vm_check_extension(). Fall back to the global fd when the VM check is unavailable: * Ancient kernels do not support KVM_CHECK_EXTENSION on the VM fd, since it was added by commit 92b591a4c46b ("KVM: Allow KVM_CHECK_EXTENSION on the vm fd") in Linux 3.17 [3]. Support for Linux 3.16 ended only in June 2020, but there may still be old images around. * A couple of calls must be issued before the VM fd is available, since they determine the VM type: KVM_CAP_MIPS_VZ and KVM_CAP_ARM_VM_IPA_SIZE Does any user actually depend on the check being done on the global fd instead of the VM fd? I surveyed all cases where KVM can return different values depending on the query method. Luckily QEMU already calls kvm_vm_check_extension() for most of those. Only three of them are ambiguous, because currently done on the global fd: * KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID on Arm, changes value if the user requests a vGIC different from the default. But QEMU queries this before vGIC configuration, so the reported value will be the same. * KVM_CAP_SW_TLB on PPC. 
When issued on the global fd, returns false if the kvm-hv module is loaded; when issued on the VM fd, returns false only if the VM type is HV instead of PR. If this returns false, then QEMU will fail to initialize a BOOKE206 MMU model. So this patch supposedly improves things, as it allows to run this type of vCPU even when both KVM modules are loaded. * KVM_CAP_PPC_SECURE_GUEST. Similarly, doing this check on a VM fd refines the returned value, and ensures that SVM is actually supported. Since QEMU follows the check with kvm_vm_enable_cap(), this patch should only provide better error reporting. [1] https://www.kernel.org/doc/html/latest/virt/kvm/api.html#kvm-check-extension [2] https://lore.kernel.org/kvm/875ybi0ytc@redhat.com/ [3] https://github.com/torvalds/linux/commit/92b591a4c46b Suggested-by: Cornelia Huck Signed-off-by: Jean-Philippe Brucker --- include/sysemu/kvm.h | 2 -- include/sysemu/kvm_int.h | 1 + accel/kvm/kvm-all.c | 26 +- target/i386/kvm/kvm.c| 6 +++--- target/ppc/kvm.c | 34 +- 5 files changed, 30 insertions(+), 39 deletions(-) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index c8281c07a7..d62054004e 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -438,8 +438,6 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cpu); int kvm_check_extension(KVMState *s, unsigned int extension); -int kvm_vm_check_extension(KVMState *s, unsigned int extension); - #define kvm_vm_enable_cap(s, capability, cap_flags, ...) 
\ ({ \ struct kvm_enable_cap cap = {\ diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index a641c974ea..f6aa22ea09 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -122,6 +122,7 @@ struct KVMState uint32_t xen_caps; uint16_t xen_gnttab_max_frames; uint16_t xen_evtchn_max_pirq; +bool check_extension_vm; }; void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index cf3a88d90e..eca815e763 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1109,22 +1109,13 @@ int kvm_check_extension(KVMState *s, unsigned int extension) { int ret; -ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); -if (ret < 0) { -ret = 0; +if (!s->check_extension_vm) { +ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); +} else { +ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); } - -return ret; -} - -int kvm_vm_check_extension(KVMState *s, unsigned int extension) -{ -int ret; - -ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); if (ret < 0) { -/* VM wide version not implemented, use global one instead */ -ret = kvm_check_extension(s, extension); +ret = 0;
Re: [PATCH] kvm: Merge kvm_check_extension() and kvm_vm_check_extension()
On Mon, Apr 24, 2023 at 03:01:54PM +0200, Cornelia Huck wrote: > > @@ -2480,6 +2471,7 @@ static int kvm_init(MachineState *ms) > > } > > > > s->vmfd = ret; > > +s->check_extension_vm = kvm_check_extension(s, > > KVM_CAP_CHECK_EXTENSION_VM); > > Hm, it's a bit strange to set s->check_extension_vm by calling a > function that already checks for the value of > s->check_extension_vm... would it be better to call kvm_ioctl() directly > on this line? Yes that's probably best. I'll use kvm_vm_ioctl() since the doc suggests to check KVM_CAP_CHECK_EXTENSION_VM on the vm fd. Thanks, Jean > > I think it would be good if some ppc folks could give this a look, but > in general, it looks fine to me. >