On 8/26/25 10:18 AM, Jason Gunthorpe wrote:

> Tested-by: Alejandro Jimenez <[email protected]>
> Signed-off-by: Jason Gunthorpe <[email protected]>
> ---
>  .clang-format                              |   1 +
>  drivers/iommu/Kconfig                      |   2 +
>  drivers/iommu/generic_pt/Kconfig           |  20 +
>  drivers/iommu/generic_pt/pt_common.h       | 354 ++++++++++++
>  drivers/iommu/generic_pt/pt_defs.h         | 323 +++++++++++
>  drivers/iommu/generic_pt/pt_fmt_defaults.h | 193 +++++++
>  drivers/iommu/generic_pt/pt_iter.h         | 636 +++++++++++++++++++++
>  drivers/iommu/generic_pt/pt_log2.h         | 130 +++++
>  include/linux/generic_pt/common.h          | 134 +++++
>  9 files changed, 1793 insertions(+)
>  create mode 100644 drivers/iommu/generic_pt/Kconfig
>  create mode 100644 drivers/iommu/generic_pt/pt_common.h
>  create mode 100644 drivers/iommu/generic_pt/pt_defs.h
>  create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h
>  create mode 100644 drivers/iommu/generic_pt/pt_iter.h
>  create mode 100644 drivers/iommu/generic_pt/pt_log2.h
>  create mode 100644 include/linux/generic_pt/common.h
> 

> diff --git a/drivers/iommu/generic_pt/pt_common.h 
> b/drivers/iommu/generic_pt/pt_common.h
> new file mode 100644
> index 00000000000000..5ed06104d38b45
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/pt_common.h
> @@ -0,0 +1,354 @@

[snip]

> +/**
> + * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry
> + * @pts: Entry to query
> + *
> + * Returns the number of contiguous items this leaf entry spans. If the 
> entry is

 * Returns:

> + * single item it returns ilog2(1).
> + */
> +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state 
> *pts);
> +
> +/**
> + * pt_entry_oa() - Output Address for this leaf entry
> + * @pts: Entry to query
> + *
> + * Return the output address for the start of the entry. If the entry

 * Return:
or
 * Returns:
(usually last in the kernel-doc comment block)

> + * is contigous this returns the same value for each sub-item. Ie::

         contiguous                                               I.e.::

> + *
> + *    log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0
> + *
> + * See pt_item_oa(). The format should implement one of these two functions
> + * depending on how it stores the OA's in the table.

or                                   OAs

> + */
> +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts);
> +
> +/**
> + * pt_entry_oa_lg2sz() - Return the size of a OA entry
> + * @pts: Entry to query
> + *
> + * If the entry is not contigous this returns pt_table_item_lg2sz(), 
> otherwise

                          contiguous

> + * it returns the total VA/OA size of the entire contiguous entry.

Modify to use
 * Returns:
?

> + */
> +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
> +{
> +     return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
> +}
> +
> +/**
> + * pt_entry_oa_full() - Return the full OA for an entry
> + * @pts: Entry to query
> + *
> + * During iteration the first entry could have a VA with an offset from the
> + * natural start of the entry. Return the true full OA considering the pts's 
> VA
> + * offset.

 *
 * Returns: the true full OA considering the pts's VA offset

> + */
> +static inline pt_oaddr_t pt_entry_oa_full(const struct pt_state *pts)
> +{
> +     return _pt_entry_oa_fast(pts) |
> +            log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts));
> +}
> +
> +/**
> + * pt_entry_set_write_clean() - Make the entry write clean
> + * @pts: Table index to change
> + *
> + * Modify the entry so that pt_entry_write_is_dirty() == false. The HW will
> + * eventually be notified of this change via a TLB flush, which is the point
> + * that the HW must become synchronized. Any "write dirty" prior to the TLB
> + * flush can be lost, but once the TLB flush completes all writes must make
> + * their entries write dirty.
> + *
> + * The format should alter the entry in a way that is compatible with any
> + * concurrent update from HW. The entire contiguous entry is changed.
> + */
> +static inline void pt_entry_set_write_clean(struct pt_state *pts);
> +
> +/**
> + * pt_entry_write_is_dirty() - True if the entry has been written to
> + * @pts: Entry to query
> + *
> + * "write dirty" means that the HW has written to the OA translated
> + * by this entry. If the entry is contiguous then the consolidated
> + * "write dirty" for all the items must be returned.
> + */
> +static inline bool pt_entry_write_is_dirty(const struct pt_state *pts);
> +
> +/**
> + * pt_full_va_prefix() - The top bits of the VA
> + * @common: Page table to query
> + *
> + * This is usually 0, but some formats have their VA space going downward 
> from
> + * PT_VADDR_MAX, and will return that instead. This value must always be
> + * adjusted by struct pt_common max_vasz_lg2.
> + */
> +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common);
> +
> +/**
> + * pt_has_system_page() - True if level 0 can install a PAGE_SHIFT entry
> + * @common: Page table to query
> + *
> + * If true the caller use at level 0 pt_install_leaf_entry(PAGE_SHIFT). This 
> is

                         uses
?
although it might just be missing a word or two? I can't tell.

> + * useful to create optimized paths for common cases of PAGE_SIZE mappings.
> + */
> +static inline bool pt_has_system_page(const struct pt_common *common);
> +
> +/**
> + * pt_install_leaf_entry() - Write a leaf entry to the table
> + * @pts: Table index to change
> + * @oa: Output Address for this leaf
> + * @oasz_lg2: Size in VA for this leaf
> + * @attrs: Attributes to modify the entry
> + *
> + * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It 
> translates
> + * the VA indicated by pts to the given OA.
> + *
> + * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz().
> + * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2.
> + *
> + * This must not be called if pt_can_have_leaf() == false. Contigous sizes

                                                              Contiguous

> + * not indicated by pt_possible_sizes() must not be specified.
> + */
> +static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
> +                                      unsigned int oasz_lg2,
> +                                      const struct pt_write_attrs *attrs);
> +
> +/**
> + * pt_install_table() - Write a table entry to the table
> + * @pts: Table index to change
> + * @table_pa: CPU physical address of the lower table's memory
> + * @attrs: Attributes to modify the table index
> + *
> + * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The 
> table_pa
> + * is the table at pts->level - 1. This is done by cmpxchg so pts must have 
> the
> + * current entry loaded. The pts is updated with the installed entry.
> + *
> + * This must not be called if pt_can_have_table() == false.
> + *
> + * Returns true if the table was installed successfully.

 * Returns:

> + */
> +static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t 
> table_pa,
> +                                 const struct pt_write_attrs *attrs);
> +
> +/**
> + * pt_item_oa() - Output Address for this leaf item
> + * @pts: Item to query
> + *
> + * Return the output address for this item. If the item is part of a 
> contiguous

 * Return:

> + * entry it returns the value of the OA for this individual sub item.
> + *
> + * See pt_entry_oa(). The format should implement one of these two functions
> + * depending on how it stores the OA's in the table.
> + */
> +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts);
> +
> +/**
> + * pt_load_entry_raw() - Read from the location pts points at into the pts
> + * @pts: Table index to load
> + *
> + * Return the type of entry that was loaded. pts->entry will be filled in 
> with

 * Return:

> + * the entry's content. See pt_load_entry()
> + */
> +static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts);
> +
> +/**
> + * pt_max_output_address_lg2() - Return the maximum OA the table format can 
> hold
> + * @common: Page table to query
> + *
> + * The value oalog2_to_max_int(pt_max_output_address_lg2()) is the MAX for 
> the
> + * OA. This is the absolute maximum address the table can hold. struct 
> pt_common
> + * max_oasz_lg2 sets a lower dynamic maximum based on HW capability.
> + */
> +static inline unsigned int
> +pt_max_output_address_lg2(const struct pt_common *common);
> +
> +/**
> + * pt_num_items_lg2() - Return the number of items in this table level
> + * @pts: The current level
> + *
> + * The number of items in a table level defines the number of bits this level
> + * decodes from the VA. This function is not called for the top level,
> + * so it does not need to compute a special value for the top case. The
> + * result for the top is based on pt_common max_vasz_lg2.
> + *
> + * The value is used as part if determining the table indexes via the

                           part of
?

> + * equation::
> + *
> + *   log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
> + */
> +static inline unsigned int pt_num_items_lg2(const struct pt_state *pts);
> +
> +/**
> + * pt_pgsz_lg2_to_level - Return the level that maps the page size
> + * @common: Page table to query
> + * @pgsize_lg2: Log2 page size
> + *
> + * Returns the table level that will map the given page size. The page

 * Returns:

> + * size must be part of the pt_possible_sizes() for some level.
> + */
> +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
> +                                             unsigned int pgsize_lg2);
> +
> +/**
> + * pt_possible_sizes() - Return a bitmap of possible output sizes at this 
> level
> + * @pts: The current level
> + *
> + * Each level has a list of possible output sizes that can be installed as
> + * leaf entries. If pt_can_have_leaf() is false returns zero.
> + *
> + * Otherwise the bit in position pt_table_item_lg2sz() should be set 
> indicating
> + * that a non-contigous singe item leaf entry is supported. The following

             non-contiguous
Also, is that               single
?
("singe" is a real word — to scorch lightly — but it doesn't fit here; it looks like a typo for "single".)

> + * pt_num_items_lg2() number of bits can be set indicating contiguous entries
> + * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be
> + * set, contiguous entries cannot span the entire table.
> + *
> + * The OR of pt_possible_sizes() of all levels is the typical bitmask of all
> + * supported sizes in the entire table.
> + */
> +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts);
> +
> +/**
> + * pt_table_item_lg2sz() - Size of a single item entry in this table level
> + * @pts: The current level
> + *
> + * The size of the item specifies how much VA and OA a single item occupies.
> + *
> + * See pt_entry_oa_lg2sz() for the same value including the effect of 
> contiguous
> + * entries.
> + */
> +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
> +
> +/**
> + * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table
> + * @pts: The current level
> + *
> + * Return the size of VA decoded by the entire table level.

 * Return:

> + */
> +static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts)
> +{
> +     if (pts->range->top_level == pts->level)
> +             return pts->range->max_vasz_lg2;
> +     return min_t(unsigned int, pts->range->common->max_vasz_lg2,
> +                  pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts));
> +}
> +
> +/**
> + * pt_table_pa() - Return the CPU physical address of the table entry
> + * @pts: Entry to query
> + *
> + * This is only ever called on PT_ENTRY_TABLE entries. Must return the same
> + * value passed to pt_install_table().
> + */
> +static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts);
> +
> +/**
> + * pt_table_ptr() - Return a CPU pointer for a table item
> + * @pts: Entry to query
> + *
> + * Same as pt_table_pa() but returns a CPU pointer.
> + */
> +static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
> +{
> +     return __va(pt_table_pa(pts));
> +}
> +
> +/**
> + * pt_load_entry() - Read from the location pts points at into the pts
> + * @pts: Table index to load
> + *
> + * Set the type of entry that was loaded. pts->entry and pts->table_lower
> + * will be filled in with the entry's content.
> + */
> +static inline void pt_load_entry(struct pt_state *pts)
> +{
> +     pts->type = pt_load_entry_raw(pts);
> +     if (pts->type == PT_ENTRY_TABLE)
> +             pts->table_lower = pt_table_ptr(pts);
> +}
> +#endif

> diff --git a/drivers/iommu/generic_pt/pt_defs.h 
> b/drivers/iommu/generic_pt/pt_defs.h
> new file mode 100644
> index 00000000000000..3673566708495d
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/pt_defs.h
> @@ -0,0 +1,323 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * This header is included before the format. It contains definitions
> + * that are required to compile the format. The header order is:
> + *  pt_defs.h
> + *  fmt_XX.h
> + *  pt_common.h
> + */
> +#ifndef __GENERIC_PT_DEFS_H
> +#define __GENERIC_PT_DEFS_H
> +
> +#include <linux/generic_pt/common.h>
> +
> +#include <linux/types.h>
> +#include <linux/atomic.h>
> +#include <linux/bits.h>
> +#include <linux/limits.h>
> +#include <linux/bug.h>
> +#include <linux/kconfig.h>
> +#include "pt_log2.h"
> +
> +/* Header self-compile default defines */
> +#ifndef pt_write_attrs
> +typedef u64 pt_vaddr_t;
> +typedef u64 pt_oaddr_t;
> +#endif
> +
> +struct pt_table_p;
> +
> +enum {
> +     PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX,
> +     PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32,
> +     PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX,
> +     PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32,
> +};

Hm, duplicated enum entry values?
Interesting.

> +
> +/*
> + * The format instantiation can have features wired off or on to optimize the
> + * code gen. Supported features are just a reflection of what the current 
> set of
> + * kernel users want to use.
> + */
> +#ifndef PT_SUPPORTED_FEATURES
> +#define PT_SUPPORTED_FEATURES 0
> +#endif
> +
> +/*
> + * When in debug mode we compile all formats with all features. This allows 
> the
> + * kunit to test the full matrix. SIGN_EXTEND can't co-exist with 
> DYNAMIC_TOP or
> + * FULL_VA.
> + */
> +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
> +enum {
> +     PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
> +     PT_DEBUG_SUPPORTED_FEATURES =
> +             UINT_MAX &
> +             ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
> +                       BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
> +                       BIT(PT_FEAT_SIGN_EXTEND)),
> +};
> +#undef PT_SUPPORTED_FEATURES
> +#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES
> +#endif
> +
> +#ifndef PT_FORCE_ENABLED_FEATURES
> +#define PT_FORCE_ENABLED_FEATURES 0
> +#endif
> +
> +/**
> + * DOC: Generic Page Table Language
> + *
> + * Language used in Generic Page Table
> + *  va
> + *     The input address to the page table, often the virtual address.
> + *  oa
> + *     The output address from the page table, often the physical address.
> + *  leaf
> + *     An entry that results in an output address. Ie a physical memory addr

                                                      I.e.,

> + *  start/end
> + *     An open range, eg [0,0) refers to no VA.

                         e.g.,

and is a half-open (or right-open) range or interval, not open.

Open would be (0, 0).
Closed would be [0, 0].
I used to think that was "clopen" but now I read that clopen refers
to sets and not intervals.



> + *  start/last
> + *     An inclusive closed range, eg [0,0] refers to the VA 0

                                     e.g.,

> + *  common
> + *     The generic page table container struct pt_common
> + *  level
> + *     The number of table hops from the lowest leaf. Level 0
> + *     is always a table of only leaves of the least significant VA bits. The
> + *     labels used by HW descriptions are never used.
> + *  top_level
> + *     The inclusive highest level of the table. A two level table

                                                       two-level

> + *     has a top level of 1.
> + *  table
> + *     A linear array of entries representing the translation items for that
> + *     level.
> + *  index
> + *     The position in a table of an element: item = table[index]
> + *  item
> + *     A single position in a table
> + *  entry
> + *     A single logical element in a table. If contiguous pages are not
> + *     supported then item and entry are the same thing, otherwise entry 
> refers
> + *     to the all the items that comprise a single contiguous translation.
> + *  item/entry_size
> + *     The number of bytes of VA the table translates for.
> + *     If the item is a table entry then the next table covers
> + *     this size. If the entry is an output address then the
> + *     full OA is: OA | (VA % entry_size)
> + *  contig_count
> + *     The number of consecutive items fused into a single entry.
> + *     item_size * contig_count is the size of that entry's translation.
> + *  lg2
> + *     Indicates the value is encoded as log2, ie 1<<x is the actual value.

                                                  i.e.,

> + *     Normally the compiler is fine to optimize divide and mod with log2 
> values
> + *     automatically when inlining, however if the values are not constant
> + *     expressions it can't. So we do it by hand, we want to avoid 64 bit

                                               hand;                  64-bit

> + *     divmod.
> + */
> +
> +/* Returned by pt_load_entry() and for_each_pt_level_entry() */
> +enum pt_entry_type {
> +     PT_ENTRY_EMPTY,
> +     PT_ENTRY_TABLE,
> +     /* Entry is valid and returns an output address */
> +     PT_ENTRY_OA,
> +};
> +
> +struct pt_range {
> +     struct pt_common *common;
> +     struct pt_table_p *top_table;
> +     pt_vaddr_t va;
> +     pt_vaddr_t last_va;
> +     u8 top_level;
> +     u8 max_vasz_lg2;
> +};
> +
> +/*
> + * Similar to xa_state, this records information about an in progress parse 
> at a

                                                             in-progress

> + * single level.
> + */
> +struct pt_state {
> +     struct pt_range *range;
> +     struct pt_table_p *table;
> +     struct pt_table_p *table_lower;
> +     u64 entry;
> +     enum pt_entry_type type;
> +     unsigned short index;
> +     unsigned short end_index;
> +     u8 level;
> +};
> +
> +#define pt_cur_table(pts, type) ((type *)((pts)->table))
> +
> +/*
> + * Try to install a new table pointer. The locking methodology requires this 
> to
> + * be atomic, multiple threads can race to install a pointer, the losing 
> threads

         atomic;                                        pointer;

> + * will fail the atomic and return false. They should free any memory and
> + * reparse the table level again.
> + */

[snip]


> +/*
> + * The full va (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 
> and
> + * generate a useful defined result. The non fva versions will malfunction at

                                            non-fva

> + * this extreme.
> + */
> +static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2)
> +{
> +     if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
> +             return 0;
> +     return log2_div_t(pt_vaddr_t, a, b_lg2);
> +}
> +
> +static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2)
> +{
> +     if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
> +             return a;
> +     return log2_mod_t(pt_vaddr_t, a, b_lg2);
> +}
> +
> +static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b,
> +                               unsigned int c_lg2)
> +{
> +     if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2)
> +             return true;
> +     return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2);
> +}
> +
> +static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val,
> +                                      unsigned int b_lg2)
> +{
> +     if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
> +             return val;
> +     return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2);
> +}
> +
> +static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int 
> b_lg2)
> +{
> +     if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
> +             return PT_VADDR_MAX;
> +     return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2);
> +}
> +

[snip]

> diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h 
> b/drivers/iommu/generic_pt/pt_fmt_defaults.h
> new file mode 100644
> index 00000000000000..8738008d024b0b
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
> @@ -0,0 +1,193 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + *
> + * Default definitions for formats that don't define these functions.
> + */
> +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
> +#define __GENERIC_PT_PT_FMT_DEFAULTS_H
> +
> +#include "pt_defs.h"
> +#include <linux/log2.h>
> +
> +/* Header self-compile default defines */
> +#ifndef pt_load_entry_raw
> +#include "fmt/amdv1.h"
> +#endif
> +
> +/*
> + * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and
> + * PT_ITEM_WORD_SIZE. The must be the same at every level excluding the top.

                         They

> + */

[snip]

> diff --git a/drivers/iommu/generic_pt/pt_iter.h 
> b/drivers/iommu/generic_pt/pt_iter.h
> new file mode 100644
> index 00000000000000..abbd243f10d879
> --- /dev/null
> +++ b/drivers/iommu/generic_pt/pt_iter.h
> @@ -0,0 +1,636 @@

[snip]

> +/*
> + * Add index_count_lg2 number of entries to pts's VA and index. The va will 
> be

s/VA/va/ for consistency?
since it ("va") is defined in Generic Page Table Language.

> + * adjusted to the end of the contiguous block if it is currently in the 
> middle.
> + */
> +static inline void _pt_advance(struct pt_state *pts,
> +                            unsigned int index_count_lg2)
> +{
> +     pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
> +                               index_count_lg2);
> +}
> +
> +/**
> + * pt_item_fully_covered() - Check if the item or entry is entirely contained
> + *                           within pts->range
> + * @pts: Iteration State
> + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or
> + *            pt_entry_oa_lg2sz()
> + *
> + * True if the item is fully enclosed by the pts->range.

 * Return: true if the item ...

> + */
> +static inline bool pt_item_fully_covered(const struct pt_state *pts,
> +                                      unsigned int oasz_lg2)
> +{
> +     struct pt_range *range = pts->range;
> +
> +     /* Range begins at the start of the entry */
> +     if (log2_mod(pts->range->va, oasz_lg2))
> +             return false;
> +
> +     /* Range ends past the end of the entry */
> +     if (!log2_div_eq(range->va, range->last_va, oasz_lg2))
> +             return true;
> +
> +     /* Range ends at the end of the entry */
> +     return log2_mod_eq_max(range->last_va, oasz_lg2);
> +}
> +
> +/**
> + * pt_range_to_index() - Starting index for an iteration
> + * @pts: Iteration State
> + *
> + * Return the starting index for the iteration in pts.

 * Return:

> + */
> +static inline unsigned int pt_range_to_index(const struct pt_state *pts)
> +{
> +     unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
> +
> +     PT_WARN_ON(pts->level > pts->range->top_level);
> +     if (pts->range->top_level == pts->level)
> +             return log2_div(fvalog2_mod(pts->range->va,
> +                                         pts->range->max_vasz_lg2),
> +                             isz_lg2);
> +     return log2_mod(log2_div(pts->range->va, isz_lg2),
> +                     pt_num_items_lg2(pts));
> +}
> +
> +/**
> + * pt_range_to_end_index() - Ending index iteration
> + * @pts: Iteration State
> + *
> + * Return the last index for the iteration in pts.

 * Return:

> + */
> +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
> +{
> +     unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
> +     struct pt_range *range = pts->range;
> +     unsigned int num_entries_lg2;
> +
> +     if (range->va == range->last_va)
> +             return pts->index + 1;
> +
> +     if (pts->range->top_level == pts->level)
> +             return log2_div(fvalog2_mod(pts->range->last_va,
> +                                         pts->range->max_vasz_lg2),
> +                             isz_lg2) +
> +                    1;
> +
> +     num_entries_lg2 = pt_num_items_lg2(pts);
> +
> +     /* last_va falls within this table */
> +     if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2))
> +             return log2_mod(log2_div(pts->range->last_va, isz_lg2),
> +                             num_entries_lg2) +
> +                    1;
> +
> +     return log2_to_int(num_entries_lg2);
> +}
> +
> +static inline void _pt_iter_first(struct pt_state *pts)
> +{
> +     pts->index = pt_range_to_index(pts);
> +     pts->end_index = pt_range_to_end_index(pts);
> +     PT_WARN_ON(pts->index > pts->end_index);
> +}
> +
> +static inline bool _pt_iter_load(struct pt_state *pts)
> +{
> +     if (pts->index >= pts->end_index)
> +             return false;
> +     pt_load_entry(pts);
> +     return true;
> +}
> +
> +/**
> + * pt_next_entry() - Advance pts to the next entry
> + * @pts: Iteration State
> + *
> + * Update pts to go to the next index at this level. If pts is pointing at a
> + * contiguous entry then the index may advance my more than one.

                                                  by

> + */
> +static inline void pt_next_entry(struct pt_state *pts)
> +{
> +     if (pts->type == PT_ENTRY_OA &&
> +         !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0))
> +             _pt_advance(pts, pt_entry_num_contig_lg2(pts));
> +     else
> +             pts->index++;
> +     pt_index_to_va(pts);
> +}
> +
> +/**
> + * for_each_pt_level_entry() - For loop wrapper over entries in the range
> + * @pts: Iteration State
> + *
> + * This is the basic iteration primitive, it iterates over all the entries in

                                  primitive. It

> + * pts->range that fall within the pts's current table level. Each step does
> + * pt_load_entry(pts).
> + */
> +#define for_each_pt_level_entry(pts) \
> +     for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts))
> +


[snip]

> +/*
> + * pt_walk_descend_all() - Recursively invoke the walker for a table item
> + * @pts: Iteration State

 * @parent_pts:

> + * @fn: Walker function to call
> + * @arg: Value to pass to the function
> + *
> + * With pts pointing at a table item this will descend and over the entire 
> lower
> + * table. This creates a new walk and does not alter pts or pts->range.
> + */
> +static __always_inline int
> +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
> +                 void *arg)
> +{


[snip]

> +/**
> + * PT_MAKE_LEVELS() - Build an unwound walker
> + * @fn: Name of the walker function
> + * @do_fn: Function to call at each level
> + *
> + * This builds a function call tree that can be fully inlined,

                                                         inlined.

> + * The caller must provide a function body in an __always_inline function::
> + *

[snip]



> diff --git a/include/linux/generic_pt/common.h 
> b/include/linux/generic_pt/common.h
> new file mode 100644
> index 00000000000000..91869fad33fbdf
> --- /dev/null
> +++ b/include/linux/generic_pt/common.h
> @@ -0,0 +1,134 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
> + */
> +#ifndef __GENERIC_PT_COMMON_H
> +#define __GENERIC_PT_COMMON_H
> +
> +#include <linux/types.h>
> +#include <linux/build_bug.h>
> +#include <linux/bits.h>
> +
> +/**
> + * DOC: Generic Radix Page Table
> + *
> + * Generic Radix Page Table is a set of functions and helpers to efficiently
> + * parse radix style page tables typically seen in HW implementations. The
> + * interface is built to deliver similar code generation as the mm's 
> pte/pmd/etc
> + * system by fully inlining the exact code required to handle each table 
> level.
> + *
> + * Like the MM each format contributes its parsing implementation under 
> common

maybe          MM system,

> + * names and the common code implements the required algorithms.
> + *
> + * The system is divided into three logical levels:
> + *
> + *  - The page table format and its manipulation functions
> + *  - Generic helpers to give a consistent API regardless of underlying 
> format
> + *  - An algorithm implementation (eg IOMMU/DRM/KVM/MM)

                                     (e.g.,

> + *
> + * Multiple implementations are supported, the intention is to have the 
> generic

                                   supported. The

> + * format code be re-usable for whatever specalized implementation is 
> required.
> + * The generic code is solely about the format of the radix tree, it does not

                                                               tree;

> + * include memory allocation or higher level decisions that are left for the
> + * implementation.
> + *
> + * The generic framework supports a superset of functions across many HW
> + * implementations:
> + *
> + *  - Entries comprised of contiguous blocks of IO PTEs for larger page sizes
> + *  - Multi-level tables, up to 6 levels. Runtime selected top level
> + *  - Runtime variable table level size (ARM's concatenated tables)
> + *  - Expandable top level allowing dynamic sizing of table levels
> + *  - Optional leaf entries at any level
> + *  - 32 bit/64 bit virtual and output addresses, using every address bit

         32-bit/64-bit

> + *  - Dirty tracking
> + */
> +
> +/**
> + * struct pt_common

    * struct pt_common - <some short struct description>

> + */
> +struct pt_common {
> +     /**
> +      * @top_of_table: Encodes the table top pointer and the top level in a
> +      * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower
> +      * bits of the aligned table pointer are used for the level.
> +      */
> +     uintptr_t top_of_table;
> +     /**
> +      * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits
> +      * must be zero. This may be less than what the page table format
> +      * supports, but must not be more.
> +      */
> +     u8 max_oasz_lg2;
> +     /**
> +      * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits
> +      * are 0 or 1 depending on pt_full_va_prefix(). This may be less than
> +      * what the page table format supports, but must not be more. When
> +      * PT_FEAT_DYNAMIC_TOP this reflects the maximum VA capability.

           PT_FEAT_DYNAMIC_TOP is set, this reflects ...
?

> +      */
> +     u8 max_vasz_lg2;
> +     /**
> +      * @features: Bitmap of `enum pt_features`
> +      */
> +     unsigned int features;
> +};
> +
> +/* Encoding parameters for top_of_table */
> +enum {
> +     PT_TOP_LEVEL_BITS = 3,
> +     PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0),
> +};
> +
> +/**
> + * enum pt_features - Features turned on in the table. Each symbol is a bit
> + * position.
> + */
> +enum pt_features {
> +     /**
> +      * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
> +      * PT_VADDR_MAX.
> +      */
> +     PT_FEAT_FULL_VA,
> +     /**
> +      * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased
> +      * dynamically during map. This requires HW support for atomically
> +      * setting both the table top pointer and the starting table level.
> +      */
> +     PT_FEAT_DYNAMIC_TOP,
> +     /**
> +      * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign
> +      * extends up to the full pt_vaddr_t. This divides the page table into
> +      * three VA ranges::
> +      *
> +      *   0         -> 2^N - 1             Lower
> +      *   2^N       -> (MAX - 2^N - 1)     Non-Canonical
> +      *   MAX - 2^N -> MAX                 Upper
> +      *
> +      * In this mode pt_common::max_vasz_lg2 includes the sign bit and the
> +      * upper bits that don't fall within the translation are just validated.
> +      *
> +      * If not set there is no sign extension and valid VA goes from 0 to 2^N
> +      * - 1.
> +      */
> +     PT_FEAT_SIGN_EXTEND,
> +     /**
> +      * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA
> +      * ranges which will clean out any walk cache or any IOPTE fully
> +      * contained by the range. The optimization objective is to minimize the
> +      * number of flushes even if ranges include IOVA gaps that do not need
> +      * to be flushed.
> +      */
> +     PT_FEAT_FLUSH_RANGE,
> +     /**
> +      * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that
> +      * the optimization objective is to only flush IOVA that has been
> +      * changed. This mode is suitable for cases like hypervisor shadowing
> +      * where flushing unchanged ranges may cause the hypervisor to reparse
> +      * significant amount of page table.
> +      */
> +     PT_FEAT_FLUSH_RANGE_NO_GAPS,
> +     /* private: */
> +     PT_FEAT_FMT_START,
> +};
> +
> +#endif
-- 
~Randy



Reply via email to