The existing IOMMU page table implementations duplicate all of the working
algorithms for each format. By using the generic page table API, a single C
version of the IOMMU algorithms can be created and reused for all of the
different formats used in the drivers. The implementation provides a single C
version of the iommu domain operations: iova_to_phys, map, unmap, and
read_and_clear_dirty.
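As a sketch of what this enables (illustrative only: aside from the
pt_iommu_FMT_map_pages() naming pattern described below, the amdv1 function
names and the exact wiring are assumptions, and the map/unmap operations only
arrive in later patches of the series), a driver for the AMD v1 format would
end up pointing its iommu_domain_ops directly at the generated per-format C
functions:

  /* Hypothetical wiring for the amdv1 format; later patches provide these */
  static const struct iommu_domain_ops amdv1_domain_ops = {
          .iova_to_phys = pt_iommu_amdv1_iova_to_phys,
          .map_pages = pt_iommu_amdv1_map_pages,
          .unmap_pages = pt_iommu_amdv1_unmap_pages,
  };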
Further, adding new algorithms and techniques becomes easy to do across the
entire fleet of drivers and formats.

The C functions are drop-in compatible with the existing iommu_domain_ops via
the IOMMU_PT_DOMAIN_OPS() macro. Each per-format compilation unit exports
symbols following the pattern pt_iommu_FMT_map_pages(), which the macro maps
directly to the iommu_domain_ops members. This avoids the additional function
pointer indirection that io-pgtable has.

The top level struct used by the drivers is pt_iommu_table_FMT. It contains
the other structs to allow container_of() to move between the driver, iommu
page table, generic page table, and generic format layers:

  struct pt_iommu_table_amdv1 {
          struct pt_iommu {
                  struct iommu_domain domain;
          } iommu;
          struct pt_amdv1 {
                  struct pt_common { } common;
          } amdpt;
  };

The driver is expected to union the pt_iommu_table_FMT with its own existing
domain struct:

  struct driver_domain {
          union {
                  struct iommu_domain domain;
                  struct pt_iommu_table_amdv1 amdv1;
          };
  };
  PT_IOMMU_CHECK_DOMAIN(struct driver_domain, amdv1, domain);

This creates an alias so that 'domain' does not have to be renamed throughout
a lot of driver code, and it allows all the layers to access the functions
they need to implement their different roles with no change to any of the
existing iommu core code.

Implement the basic starting point: pt_iommu_init(), get_info() and deinit().

Signed-off-by: Jason Gunthorpe <j...@nvidia.com>
---
 drivers/iommu/generic_pt/Kconfig              |  13 +
 drivers/iommu/generic_pt/fmt/iommu_template.h |  39 +++
 drivers/iommu/generic_pt/iommu_pt.h           | 265 ++++++++++++++++++
 include/linux/generic_pt/iommu.h              | 112 ++++++++
 4 files changed, 429 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
 create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
 create mode 100644 include/linux/generic_pt/iommu.h

diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index 775a3afb563f72..73b7a54375f9bd 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -19,4 +19,17 @@ config DEBUG_GENERIC_PT
 	  kernels.
 	  The kunit tests require this to be enabled to get full
 	  coverage.
+
+config IOMMU_PT
+	tristate "IOMMU Page Tables"
+	select IOMMU_API
+	depends on IOMMU_SUPPORT
+	depends on GENERIC_PT
+	default n
+	help
+	  Generic library for building IOMMU page tables
+
+	  IOMMU_PT provides an implementation of the page table operations
+	  related to struct iommu_domain using GENERIC_PT to abstract the
+	  page table format.
 endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
new file mode 100644
index 00000000000000..5b631bc07cbc16
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Template to build the iommu module and kunit from the format and
+ * implementation headers.
+ *
+ * The format should have:
+ *  #define PT_FMT <name>
+ *  #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
+ * And optionally:
+ *  #define PT_FORCE_ENABLED_FEATURES ..
+ *  #define PT_FMT_VARIANT <suffix>
+ */
+#include <linux/args.h>
+#include <linux/stringify.h>
+
+#ifdef PT_FMT_VARIANT
+#define PTPFX_RAW \
+	CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
+#else
+#define PTPFX_RAW PT_FMT
+#endif
+
+#define PTPFX CONCATENATE(PTPFX_RAW, _)
+
+#define _PT_FMT_H PT_FMT.h
+#define PT_FMT_H __stringify(_PT_FMT_H)
+
+#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
+#define PT_DEFS_H __stringify(_PT_DEFS_H)
+
+#include <linux/generic_pt/common.h>
+#include PT_DEFS_H
+#include "../pt_defs.h"
+#include PT_FMT_H
+#include "../pt_common.h"
+
+#include "../iommu_pt.h"
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
new file mode 100644
index 00000000000000..4c228689b0a3d0
--- /dev/null
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * "Templated C code" for implementing the iommu operations for page tables.
+ * This is compiled multiple times, over all the page table formats to pick up
+ * the per-format definitions.
+ */
+#ifndef __GENERIC_PT_IOMMU_PT_H
+#define __GENERIC_PT_IOMMU_PT_H
+
+#include "pt_iter.h"
+
+#include <linux/iommu.h>
+#include "../iommu-pages.h"
+#include <linux/export.h>
+
+#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
+
+struct pt_iommu_collect_args {
+	struct iommu_pages_list free_list;
+	u8 ignore_mapped : 1;
+};
+
+static int __collect_tables(struct pt_range *range, void *arg,
+			    unsigned int level, struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_collect_args *collect = arg;
+	int ret;
+
+	if (collect->ignore_mapped && !pt_can_have_table(&pts))
+		return 0;
+
+	for_each_pt_level_entry(&pts) {
+		if (pts.type == PT_ENTRY_TABLE) {
+			iommu_pages_list_add(&collect->free_list, pts.table_lower);
+			ret = pt_descend(&pts, arg, __collect_tables);
+			if (ret)
+				return ret;
+			continue;
+		}
+		if (pts.type == PT_ENTRY_OA && !collect->ignore_mapped)
+			return -EADDRINUSE;
+	}
+	return 0;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+						 uintptr_t top_of_table,
+						 gfp_t gfp)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(common);
+
+	/*
+	 * Top doesn't need the free list or otherwise, so it technically
+	 * doesn't need to use iommu pages. Use the API anyhow as the top is
+	 * usually not smaller than PAGE_SIZE to keep things simple.
+	 */
+	return iommu_alloc_pages_node_sz(
+		iommu_table->nid, gfp,
+		log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
+}
+
+static void NS(get_info)(struct pt_iommu *iommu_table,
+			 struct pt_iommu_info *info)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_range range = pt_top_range(common);
+	struct pt_state pts = pt_init_top(&range);
+	pt_vaddr_t pgsize_bitmap = 0;
+
+	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
+		for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
+		     pts.level++) {
+			if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
+				break;
+			pgsize_bitmap |= pt_possible_sizes(&pts);
+		}
+	} else {
+		for (pts.level = 0; pts.level <= range.top_level; pts.level++)
+			pgsize_bitmap |= pt_possible_sizes(&pts);
+	}
+
+	/* Hide page sizes larger than the maximum OA */
+	info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
+}
+
+static void NS(deinit)(struct pt_iommu *iommu_table)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_range range = pt_all_range(common);
+	struct pt_iommu_collect_args collect = {
+		.free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+		.ignore_mapped = true,
+	};
+
+	iommu_pages_list_add(&collect.free_list, range.top_table);
+	pt_walk_range(&range, __collect_tables, &collect);
+
+	/*
+	 * The driver has to already have fenced the HW access to the page table
+	 * and invalidated any caching referring to this memory.
+	 */
+	iommu_put_pages_list(&collect.free_list);
+}
+
+static const struct pt_iommu_ops NS(ops) = {
+	.get_info = NS(get_info),
+	.deinit = NS(deinit),
+};
+
+static int pt_init_common(struct pt_common *common)
+{
+	struct pt_range top_range = pt_top_range(common);
+
+	if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
+		return -EINVAL;
+
+	if (top_range.top_level == PT_MAX_TOP_LEVEL ||
+	    common->max_vasz_lg2 == top_range.max_vasz_lg2)
+		common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
+
+	if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
+		common->features |= BIT(PT_FEAT_FULL_VA);
+
+	/* Requested features must match features compiled into this format */
+	if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
+	    (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
+	     (common->features & PT_FORCE_ENABLED_FEATURES) !=
+		     PT_FORCE_ENABLED_FEATURES))
+		return -EOPNOTSUPP;
+
+	if (common->max_oasz_lg2 == 0)
+		common->max_oasz_lg2 = pt_max_output_address_lg2(common);
+	else
+		common->max_oasz_lg2 = min(common->max_oasz_lg2,
+					   pt_max_output_address_lg2(common));
+	return 0;
+}
+
+static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
+				struct iommu_domain *domain)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_iommu_info info;
+	struct pt_range range;
+
+	NS(get_info)(iommu_table, &info);
+
+	domain->type = __IOMMU_DOMAIN_PAGING;
+	domain->pgsize_bitmap = info.pgsize_bitmap;
+
+	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+		range = _pt_top_range(common,
+				      _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
+	else
+		range = pt_top_range(common);
+
+	/*
+	 * A 64 bit high address space table on a 32 bit system cannot work.
+	 */
+	domain->geometry.aperture_start = (unsigned long)range.va;
+	if ((pt_vaddr_t)domain->geometry.aperture_start != range.va ||
+	    range.va > ULONG_MAX)
+		return -EOVERFLOW;
+
+	/*
+	 * The aperture is limited to what the API can do after considering all
+	 * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
+	 * to store a VA. Set the aperture to something that is valid for all
+	 * cases. Saturate instead of truncate the end if the types are smaller
+	 * than the top range. aperture_end is a last (inclusive) address.
+	 */
+	domain->geometry.aperture_end = (unsigned long)range.last_va;
+	if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
+		domain->geometry.aperture_end = ULONG_MAX;
+		domain->pgsize_bitmap &= ULONG_MAX;
+	}
+
+	return 0;
+}
+
+static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
+{
+	struct pt_iommu *iommu_table = &fmt_table->iommu;
+	struct pt_iommu cfg = *iommu_table;
+
+	static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
+	memset_after(fmt_table, 0, iommu.domain);
+
+	/* The caller can initialize some of these values */
+	iommu_table->nid = cfg.nid;
+}
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+		  const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
+{
+	struct pt_iommu *iommu_table = &fmt_table->iommu;
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_table_p *table_mem;
+	int ret;
+
+	if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
+	    !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
+		return -EINVAL;
+
+	pt_iommu_zero(fmt_table);
+	common->features = cfg->common.features;
+	common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+	common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
+#ifdef PT_FIXED_TOP_LEVEL
+	pt_top_set_level(common, PT_FIXED_TOP_LEVEL);
+#endif
+	ret = pt_iommu_fmt_init(fmt_table, cfg);
+	if (ret)
+		return ret;
+
+	if (cfg->common.hw_max_oasz_lg2 > pt_max_output_address_lg2(common))
+		return -EINVAL;
+
+	ret = pt_init_common(common);
+	if (ret)
+		return ret;
+
+	if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
+	    (pt_feature(common, PT_FEAT_FULL_VA) ||
+	     pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
+		return -EINVAL;
+
+	iommu_table->ops = &NS(ops);
+	ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
+	if (ret)
+		return ret;
+
+	table_mem = table_alloc_top(common, common->top_of_table, gfp);
+	if (IS_ERR(table_mem))
+		return PTR_ERR(table_mem);
+	pt_top_set(common, table_mem, pt_top_get_level(common));
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
+
+#ifdef pt_iommu_fmt_hw_info
+#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
+#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
+void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
+		      struct pt_iommu_table_hw_info *info)
+{
+	struct pt_iommu *iommu_table = &fmt_table->iommu;
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct pt_range top_range = pt_top_range(common);
+
+	pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IOMMU Pagetable implementation for " __stringify(PTPFX_RAW));
+MODULE_IMPORT_NS("GENERIC_PT");
+
+#endif
diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
new file mode 100644
index 00000000000000..636f856cc5fdcc
--- /dev/null
+++ b/include/linux/generic_pt/iommu.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_IOMMU_H
+#define __GENERIC_PT_IOMMU_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/iommu.h>
+#include <linux/mm_types.h>
+
+struct pt_iommu_ops;
+
+/**
+ * DOC: IOMMU Radix Page Table
+ *
+ * The iommu implementation of the Generic Page Table provides an ops struct
+ * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and
+ * the generic map/unmap interface.
+ *
+ * This interface uses a caller provided locking approach. The caller must have
+ * a VA range lock concept that prevents concurrent threads from calling ops on
+ * the same VA. Generally the range lock must be at least as large as a single
+ * map call.
+ */
+
+/**
+ * struct pt_iommu - Base structure for iommu page tables
+ *
+ * The format specific struct will include this as the first member.
+ */
+struct pt_iommu {
+	/**
+	 * @domain: The core iommu domain. The driver should use a union to
+	 * overlay this memory with its previously existing domain struct to
+	 * create an alias.
+	 */
+	struct iommu_domain domain;
+
+	/**
+	 * @ops: Function pointers to access the API
+	 */
+	const struct pt_iommu_ops *ops;
+
+	/**
+	 * @nid: Node ID to use for table memory allocations. The iommu driver
+	 * may want to set the NID to the device's NID, if there are multiple
+	 * table walkers.
+	 */
+	int nid;
+};
+
+/**
+ * struct pt_iommu_info - Details about the iommu page table
+ *
+ * Returned from pt_iommu_ops->get_info()
+ */
+struct pt_iommu_info {
+	/**
+	 * @pgsize_bitmap: A bitmask where each set bit indicates
+	 * a page size that can be natively stored in the page table.
+	 */
+	u64 pgsize_bitmap;
+};
+
+struct pt_iommu_ops {
+	/**
+	 * get_info() - Return the pt_iommu_info structure
+	 * @iommu_table: Table to query
+	 *
+	 * Return some basic static information about the page table.
+	 */
+	void (*get_info)(struct pt_iommu *iommu_table,
+			 struct pt_iommu_info *info);
+
+	/**
+	 * deinit() - Undo a format specific init operation
+	 * @iommu_table: Table to destroy
+	 *
+	 * Release all of the memory. The caller must have already removed the
+	 * table from all HW access and all caches.
+	 */
+	void (*deinit)(struct pt_iommu *iommu_table);
+};
+
+static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
+{
+	iommu_table->ops->deinit(iommu_table);
+}
+
+/**
+ * struct pt_iommu_cfg - Common configuration values for all formats
+ */
+struct pt_iommu_cfg {
+	/**
+	 * @features: Features required. Only these features will be turned on.
+	 * The feature list should reflect what the IOMMU HW is capable of.
+	 */
+	unsigned int features;
+	/**
+	 * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will
+	 * imply the top level of the table.
+	 */
+	u8 hw_max_vasz_lg2;
+	/**
+	 * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format
+	 * might select a lower maximum OA.
+	 */
+	u8 hw_max_oasz_lg2;
+};
+
+#endif
-- 
2.43.0