Hi Jason,
On 5/1/24 08:57, Jason Chien wrote:
On 2024/3/8 12:03 AM, Daniel Henrique Barboza wrote:
From: Tomasz Jeznach <tjezn...@rivosinc.com>
The RISC-V IOMMU specification is now ratified as per the RISC-V
international process. The latest frozen specification can be found
at:
https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
Add the foundation of the device emulation for the RISC-V IOMMU, which
includes an IOMMU that has no capabilities other than MSI interrupt support
and the fault queue interfaces. We'll add more features incrementally in the
next patches.
Co-developed-by: Sebastien Boeuf <s...@rivosinc.com>
Signed-off-by: Sebastien Boeuf <s...@rivosinc.com>
Signed-off-by: Tomasz Jeznach <tjezn...@rivosinc.com>
Signed-off-by: Daniel Henrique Barboza <dbarb...@ventanamicro.com>
---
hw/riscv/Kconfig | 4 +
hw/riscv/meson.build | 1 +
hw/riscv/riscv-iommu.c | 1492 ++++++++++++++++++++++++++++++++++++++
hw/riscv/riscv-iommu.h | 141 ++++
hw/riscv/trace-events | 11 +
hw/riscv/trace.h | 2 +
include/hw/riscv/iommu.h | 36 +
meson.build | 1 +
8 files changed, 1688 insertions(+)
create mode 100644 hw/riscv/riscv-iommu.c
create mode 100644 hw/riscv/riscv-iommu.h
create mode 100644 hw/riscv/trace-events
create mode 100644 hw/riscv/trace.h
create mode 100644 include/hw/riscv/iommu.h
diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index 5d644eb7b1..faf6a10029 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -1,3 +1,6 @@
+config RISCV_IOMMU
+ bool
+
(...)
+
+/* IOMMU index for transactions without PASID specified. */
+#define RISCV_IOMMU_NOPASID 0
+
+static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
+{
+ const uint32_t ipsr =
+ riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
+ const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);
+ if (s->notify && !(ipsr & (1 << vec))) {
+ s->notify(s, (ivec >> (vec * 4)) & 0x0F);
+ }
+}
The RISC-V IOMMU also supports WSI.
+
I mentioned in the review with Frank that this implementation does not support
WSI, but it seems clearer to do the check here nevertheless. I'll add it.
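Something along these lines, as a rough sketch on top of the current function
(RISCV_IOMMU_FCTL_WSI is assumed here to be the fctl.WSI field macro from
riscv-iommu.h):

    static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
    {
        const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
        const uint32_t ipsr =
            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
        const uint32_t ivec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_IVEC);

        if (fctl & RISCV_IOMMU_FCTL_WSI) {
            /* Wire-signaled interrupts selected: no MSI generation here */
            return;
        }

        if (s->notify && !(ipsr & (1 << vec))) {
            s->notify(s, (ivec >> (vec * 4)) & 0x0F);
        }
    }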
+static void riscv_iommu_fault(RISCVIOMMUState *s,
+ struct riscv_iommu_fq_record *ev)
+{
(...)
+
+ /*
+ * Check supported device id width (in bits).
+ * See IOMMU Specification, Chapter 6. Software guidelines.
+ * - if extended device-context format is used:
+ * 1LVL: 6, 2LVL: 15, 3LVL: 24
+ * - if base device-context format is used:
+ * 1LVL: 7, 2LVL: 16, 3LVL: 24
+ */
+ if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
+ return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
The cause should be 260, not 258.
From the RISC-V IOMMU Architecture Spec v1.0.0, section 2.3:
If the device_id is wider than that supported by the IOMMU mode, as determined
by the following checks, then stop and report "Transaction type disallowed"
(cause = 260).
a. ddtp.iommu_mode is 2LVL and DDI[2] is not 0
b. ddtp.iommu_mode is 1LVL and either DDI[2] is not 0 or DDI[1] is not 0
Changed.
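That is, the width check becomes:

    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
        /* Transaction type disallowed (cause = 260) */
        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
    }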
+ }
+
+ /* Device directory tree walk */
+ for (; depth-- > 0; ) {
+ /*
+ * Select device id index bits based on device directory tree level
+ * and device context format.
+ * See IOMMU Specification, Chapter 2. Data Structures.
+ * - if extended device-context format is used:
+ * device index: [23:15][14:6][5:0]
+ * - if base device-context format is used:
+ * device index: [23:16][15:7][6:0]
+ */
+ const int split = depth * 9 + 6 + dc_fmt;
+ addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
+ if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+ MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+ return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+ }
+ le64_to_cpus(&de);
+ if (!(de & RISCV_IOMMU_DDTE_VALID)) {
+ /* invalid directory entry */
+ return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+ }
+ if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
+ /* reserved bits set */
+ return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
The cause should be 259, not 258.
From RISC-V IOMMU Architecture Spec v1.0.0 section 2.3.1:
If any bits or encoding that are reserved for future standard use are set within ddte,
stop and report "DDT entry misconfigured" (cause = 259).
Changed.
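And the reserved-bits case then reports the misconfigured cause (assuming a
RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED define for cause 259):

    if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
        /* Reserved bits set: DDT entry misconfigured (cause = 259) */
        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
    }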
+ }
+ addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
+ }
+
+ /* index into device context entry page */
+ addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
+
+ memset(&dc, 0, sizeof(dc));
+ if (dma_memory_read(s->target_as, addr, &dc, dc_len,
+ MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+ return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+ }
+
+ /* Set translation context. */
+ ctx->tc = le64_to_cpu(dc.tc);
+ ctx->ta = le64_to_cpu(dc.ta);
+ ctx->msiptp = le64_to_cpu(dc.msiptp);
+ ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
+ ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
+
According to the RISC-V IOMMU Architecture Spec v1.0.0, section 2.1.4, we
should do some checks on the device context that was found.
I added a new helper to validate the device context at this point, following
the section 2.1.4 steps.
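A rough sketch of the helper, showing only a couple of the section 2.1.4
checks (the capability and DC field macros not used elsewhere in this patch,
e.g. RISCV_IOMMU_CAP_ATS and RISCV_IOMMU_DC_TC_EN_ATS, are assumed to come
from riscv-iommu.h):

    /* Validate the loaded device context (spec section 2.1.4). */
    static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
                                                RISCVIOMMUContext *ctx)
    {
        /* ATS-related enables are reserved when capabilities.ATS is 0 */
        if (!(s->cap & RISCV_IOMMU_CAP_ATS) &&
            (ctx->tc & (RISCV_IOMMU_DC_TC_EN_ATS |
                        RISCV_IOMMU_DC_TC_EN_PRI |
                        RISCV_IOMMU_DC_TC_PRPR))) {
            return false;
        }

        /* msiptp.MODE must be Off or Flat when MSI translation is present */
        if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
            uint32_t msi_mode = get_field(ctx->msiptp,
                                          RISCV_IOMMU_DC_MSIPTP_MODE);
            if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
                msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
                return false;
            }
        }

        return true;
    }

The caller turns a 'false' result into the "DDT entry misconfigured" fault
cause.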
+ if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+ return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+ }
+
+ if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
+ if (ctx->pasid != RISCV_IOMMU_NOPASID) {
+ /* PASID is disabled */
+ return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+ }
+ return 0;
+ }
+
(...)
+
+static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
+{
+ uint64_t base;
+ uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+ uint32_t ctrl_clr;
+ bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
+ bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
+
+ if (enable && !active) {
+ base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
+ s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
+ s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
+ stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
+ stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
+ stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
+ ctrl_set = RISCV_IOMMU_CQCSR_CQON;
+ ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
+ RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO;
cqcsr.fence_w_ip should be set to 0 as well.
Done.
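That amounts to adding the bit to ctrl_clr in the enable path (assuming the
field macro is named RISCV_IOMMU_CQCSR_FENCE_W_IP):

    ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
               RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
               RISCV_IOMMU_CQCSR_FENCE_W_IP;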
+ } else if (!enable && active) {
+ stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
+ ctrl_set = 0;
+ ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
+ } else {
(...)
+}
+
+static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+ RISCVIOMMUState *s = opaque;
+ uint32_t regb = addr & ~3;
+ uint32_t busy = 0;
+ uint32_t exec = 0;
+
+ if (size == 0 || size > 8 || (addr & (size - 1)) != 0) {
+ /* Unsupported MMIO alignment or access size */
+ return MEMTX_ERROR;
+ }
+
+ if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+ /* Unsupported MMIO access location. */
+ return MEMTX_ACCESS_ERROR;
+ }
+
+ /* Track actionable MMIO write. */
+ switch (regb) {
There should be a case for the IPSR register.
From RISC-V IOMMU Architecture Spec v1.0.0 section 5.18:
If a bit in ipsr is 1 then a write of 1 to the bit transitions the bit from
1→0. If the conditions to set that bit are still present (See [IPSR_FIELDS]) or
if they occur after the bit is cleared then that bit transitions again from 0→1.
I created a new helper to handle IPSR updates via riscv_iommu_mmio_write().
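The helper is essentially a write-1-to-clear handler that re-raises a bit if
its source condition is still present. A sketch, covering only the cip/fip
handling (the CQCSR/FQCSR/IPSR field macros not shown in this patch excerpt
are assumed to exist in riscv-iommu.h):

    /* W1C semantics for IPSR, re-asserting still-pending sources. */
    static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
    {
        uint32_t cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
        uint32_t fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
        uint32_t ipsr_set = 0;
        uint32_t ipsr_clr = 0;

        if (data & RISCV_IOMMU_IPSR_CIP) {
            /* Keep cip set if an enabled command-queue condition persists */
            if ((cqcsr & RISCV_IOMMU_CQCSR_CIE) &&
                (cqcsr & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_ILL |
                          RISCV_IOMMU_CQCSR_CMD_TO))) {
                ipsr_set |= RISCV_IOMMU_IPSR_CIP;
            } else {
                ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
            }
        }

        if (data & RISCV_IOMMU_IPSR_FIP) {
            /* Same for fip and the fault-queue status bits */
            if ((fqcsr & RISCV_IOMMU_FQCSR_FIE) &&
                (fqcsr & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF))) {
                ipsr_set |= RISCV_IOMMU_IPSR_FIP;
            } else {
                ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
            }
        }

        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
    }

It gets called from a new RISCV_IOMMU_REG_IPSR case in
riscv_iommu_mmio_write().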
+ case RISCV_IOMMU_REG_DDTP:
+ case RISCV_IOMMU_REG_DDTP + 4:
+ exec = BIT(RISCV_IOMMU_EXEC_DDTP);
(...)
+static void riscv_iommu_realize(DeviceState *dev, Error **errp)
+{
+ RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+ s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
+ if (s->enable_msi) {
+ s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
+ }
+ /* Report QEMU target physical address space limits */
+ s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
+ TARGET_PHYS_ADDR_SPACE_BITS);
+
+ /* TODO: method to report supported PASID bits */
+ s->pasid_bits = 8; /* restricted to size of MemTxAttrs.pasid */
+ s->cap |= RISCV_IOMMU_CAP_PD8;
+
+ /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
+ s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
+ RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
+
+ /* register storage */
+ s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+ s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+ s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+
+ /* Mark all registers read-only */
+ memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
+
+ /*
+ * Register complete MMIO space, including MSI/PBA registers.
+ * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
+ * managed directly by the PCIDevice implementation.
+ */
+ memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
+ "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
+
+ /* Set power-on register state */
+ stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
+ stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], s->fctl);
s->fctl is not initialized.
I believe the idea is to init it to zero. I'll change the code to init it to
zero explicitly.
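That is, just an explicit assignment in riscv_iommu_realize() before the
power-on register state is written:

    /* Explicit power-on default: no fctl features enabled */
    s->fctl = 0;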
+ stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
+ ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
+ stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
+ ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
+ stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
(...)
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+ Error **errp)
+{
+ if (bus->iommu_ops &&
+ bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
+ /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
+ RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
+ QLIST_INSERT_AFTER(last, iommu, iommus);
+ } else if (bus->iommu_ops == NULL) {
+ pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
The original bus->iommu_op and bus->iommu_opaque will be lost.
Not sure what you meant by 'iommu_op'. We have 'iommu_ops', which is checked
for NULL before calling pci_setup_iommu().
As for overwriting the original bus->iommu_opaque, I added an extra
iommu_opaque == NULL check. We'll make it:

    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);

This will guarantee that we're not overwriting any existing ops or opaque by
accident.
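For completeness, the resulting function would look roughly like this (the
error reporting in the final else branch is a sketch, message wording TBD):

    void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
                                     Error **errp)
    {
        if (bus->iommu_ops &&
            bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
            /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
            RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
            QLIST_INSERT_AFTER(last, iommu, iommus);
        } else if (!bus->iommu_ops && !bus->iommu_opaque) {
            pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
        } else {
            error_setg(errp, "can't attach RISC-V IOMMU to PCI bus %d: "
                       "a different IOMMU is already registered",
                       pci_bus_num(bus));
        }
    }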
Thanks,
Daniel