On 9/11/25 22:15, David Hildenbrand wrote:
> On 08.09.25 02:04, Balbir Singh wrote:
>> Extend core huge page management functions to handle device-private THP
>> entries. This enables proper handling of large device-private folios in
>> fundamental MM operations.
>>
>> The following functions have been updated:
>>
>> - copy_huge_pmd(): Handle device-private entries during fork/clone
>> - zap_huge_pmd(): Properly free device-private THP during munmap
>> - change_huge_pmd(): Support protection changes on device-private THP
>> - __pte_offset_map(): Add device-private entry awareness
>>
>> Cc: Andrew Morton <a...@linux-foundation.org>
>> Cc: David Hildenbrand <da...@redhat.com>
>> Cc: Zi Yan <z...@nvidia.com>
>> Cc: Joshua Hahn <joshua.hah...@gmail.com>
>> Cc: Rakie Kim <rakie....@sk.com>
>> Cc: Byungchul Park <byungc...@sk.com>
>> Cc: Gregory Price <gou...@gourry.net>
>> Cc: Ying Huang <ying.hu...@linux.alibaba.com>
>> Cc: Alistair Popple <apop...@nvidia.com>
>> Cc: Oscar Salvador <osalva...@suse.de>
>> Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
>> Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
>> Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
>> Cc: Nico Pache <npa...@redhat.com>
>> Cc: Ryan Roberts <ryan.robe...@arm.com>
>> Cc: Dev Jain <dev.j...@arm.com>
>> Cc: Barry Song <bao...@kernel.org>
>> Cc: Lyude Paul <ly...@redhat.com>
>> Cc: Danilo Krummrich <d...@kernel.org>
>> Cc: David Airlie <airl...@gmail.com>
>> Cc: Simona Vetter <sim...@ffwll.ch>
>> Cc: Ralph Campbell <rcampb...@nvidia.com>
>> Cc: Mika Penttilä <mpent...@redhat.com>
>> Cc: Matthew Brost <matthew.br...@intel.com>
>> Cc: Francois Dugast <francois.dug...@intel.com>
>>
>> Signed-off-by: Matthew Brost <matthew.br...@intel.com>
>> Signed-off-by: Balbir Singh <balb...@nvidia.com>
>> ---
>>   include/linux/swapops.h | 27 +++++++++++++++++++
>>   mm/huge_memory.c        | 60 ++++++++++++++++++++++++++++++++++++-----
>>   mm/pgtable-generic.c    |  6 +++++
>>   3 files changed, 86 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
>> index 64ea151a7ae3..59c5889a4d54 100644
>> --- a/include/linux/swapops.h
>> +++ b/include/linux/swapops.h
>> @@ -594,6 +594,33 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>>   }
>>   #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
>> +
>> +/**
>> + * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry
>> + * @pmd: The PMD to check
>> + *
>> + * Returns true if the PMD contains a swap entry that represents a device private
>> + * page mapping. This is used for zone device private pages that have been
>> + * swapped out but still need special handling during various memory management
>> + * operations.
>> + *
>> + * Return: 1 if PMD contains device private entry, 0 otherwise
>> + */
>> +static inline int is_pmd_device_private_entry(pmd_t pmd)
>> +{
>> +    return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
>> +}
>> +
>> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +
>> +static inline int is_pmd_device_private_entry(pmd_t pmd)
>> +{
>> +    return 0;
>> +}
>> +
>> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +
>>   static inline int non_swap_entry(swp_entry_t entry)
>>   {
>>       return swp_type(entry) >= MAX_SWAPFILES;
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 26cedfcd7418..2af74e09b279 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1703,8 +1703,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>      if (unlikely(is_swap_pmd(pmd))) {
>>          swp_entry_t entry = pmd_to_swp_entry(pmd);
>> -        VM_BUG_ON(!is_pmd_migration_entry(pmd));
>> -        if (!is_readable_migration_entry(entry)) {
>> +        VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
>> +                !is_pmd_device_private_entry(pmd));
>> +
> 
> Wrong indentation.
checkpatch.pl did not complain; what I see is

+        VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
+                !is_pmd_device_private_entry(pmd));

It looks different in your reply (is your email client converting tabs to
spaces?). Did you want me to align the conditions, like this?

+        VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
+                   !is_pmd_device_private_entry(pmd));

> 
>> +        if (is_migration_entry(entry) &&
>> +                !is_readable_migration_entry(entry)) {
> 
> Ditto.
> 

Same as above :)

> Wonder if we want to be more explicit.
> 
> if (is_readable_migration_entry(entry) ||
>     is_readable_exclusive_migration_entry(entry)) {
> 

!is_readable_migration_entry() means the entry is either writable or
readable-exclusive; did you mean is_writable_migration_entry() above?

> 
>>              entry = make_readable_migration_entry(
>>                              swp_offset(entry));
>>              pmd = swp_entry_to_pmd(entry);
>> @@ -1713,7 +1716,37 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>              if (pmd_swp_uffd_wp(*src_pmd))
>>                  pmd = pmd_swp_mkuffd_wp(pmd);
>>              set_pmd_at(src_mm, addr, src_pmd, pmd);
>> +        } else if (is_device_private_entry(entry)) {
>> +            /*
>> +             * For device private entries, since there are no
>> +             * read exclusive entries, writable = !readable
>> +             */
>> +            if (is_writable_device_private_entry(entry)) {
>> +                entry = make_readable_device_private_entry(
>> +                        swp_offset(entry));
> 
> Put this on a single line.
> 

I tried to keep it under 80 columns, but I see that checkpatch now allows
up to 100 columns; will fix.

>> +                pmd = swp_entry_to_pmd(entry);
>> +
>> +                if (pmd_swp_soft_dirty(*src_pmd))
>> +                    pmd = pmd_swp_mksoft_dirty(pmd);
>> +                if (pmd_swp_uffd_wp(*src_pmd))
>> +                    pmd = pmd_swp_mkuffd_wp(pmd);
>> +                set_pmd_at(src_mm, addr, src_pmd, pmd);
>> +            }
>> +
>> +            src_folio = pfn_swap_entry_folio(entry);
>> +            VM_WARN_ON(!folio_test_large(src_folio));
>> +
>> +            folio_get(src_folio);
>> +            /*
>> +             * folio_try_dup_anon_rmap_pmd does not fail for
>> +             * device private entries.
>> +             */
>> +            ret = folio_try_dup_anon_rmap_pmd(src_folio,
>> +                    &src_folio->page,
>> +                    dst_vma, src_vma);
>> +            VM_WARN_ON(ret);
> 
> Please just drop the ret + VM_WARN_ON here, like we did in the PTE case.
> 

Ack

>>          }
>> +
>>      add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>      mm_inc_nr_ptes(dst_mm);
>>      pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
>> @@ -2211,15 +2244,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>          folio_remove_rmap_pmd(folio, page, vma);
>>          WARN_ON_ONCE(folio_mapcount(folio) < 0);
>>          VM_BUG_ON_PAGE(!PageHead(page), page);
>> -    } else if (thp_migration_supported()) {
>> +    } else if (is_pmd_migration_entry(orig_pmd) ||
>> +            is_pmd_device_private_entry(orig_pmd)) {
> 
> 
> Indentation ...
> 

Same as the discussion above

>>          swp_entry_t entry;
>> -        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
>>          entry = pmd_to_swp_entry(orig_pmd);
>>          folio = pfn_swap_entry_folio(entry);
>>          flush_needed = 0;
>> -    } else
>> -        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
>> +
>> +        if (!thp_migration_supported())
>> +            WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
>> +    }
>>      if (folio_test_anon(folio)) {
>>          zap_deposited_table(tlb->mm, pmd);
>> @@ -2239,6 +2274,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>          folio_mark_accessed(folio);
>>      }
>> +    if (folio_is_device_private(folio)) {
>> +        folio_remove_rmap_pmd(folio, &folio->page, vma);
>> +        WARN_ON_ONCE(folio_mapcount(folio) < 0);
>> +        folio_put(folio);
>> +    }
>> +
>>      spin_unlock(ptl);
>>      if (flush_needed)
>>          tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
>> @@ -2367,7 +2408,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>          struct folio *folio = pfn_swap_entry_folio(entry);
>>          pmd_t newpmd;
>> -        VM_BUG_ON(!is_pmd_migration_entry(*pmd));
>> +        VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
>> +                !folio_is_device_private(folio));
>>          if (is_writable_migration_entry(entry)) {
>>              /*
>>               * A protection check is difficult so
>> @@ -2380,6 +2422,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>              newpmd = swp_entry_to_pmd(entry);
>>              if (pmd_swp_soft_dirty(*pmd))
>>                  newpmd = pmd_swp_mksoft_dirty(newpmd);
>> +        } else if (is_writable_device_private_entry(entry)) {
>> +            entry = make_readable_device_private_entry(
>> +                    swp_offset(entry));
>> +            newpmd = swp_entry_to_pmd(entry);
>>          } else {
>>              newpmd = *pmd;
>>          }
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 567e2d084071..604e8206a2ec 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>>          *pmdvalp = pmdval;
>>      if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
>>          goto nomap;
>> +    if (is_swap_pmd(pmdval)) {
>> +        swp_entry_t entry = pmd_to_swp_entry(pmdval);
>> +
>> +        if (is_device_private_entry(entry))
>> +            goto nomap;
>> +    }
> 
> Couldn't we do here
> 
> if (!pmd_present(pmdval))
>     goto nomap;
> 
> To replace the original pmd_none() .. check.
> 
> A page table must always be present IIRC.
> 

I am not sure about dropping the pmd_none() check; a page table may not be
present, and I have not audited all the callers. But I think we can do

    if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
        goto nomap;

and remove the migration and device private entry checks.

Balbir
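For illustration, a minimal sketch of what the simplified check in
___pte_offset_map() could look like (untested; it assumes pmd_present() is
indeed false for PMD migration entries and device-private THP entries on all
supported architectures, as suggested above, and that no caller relies on
distinguishing those cases here):

    pmdval = pmdp_get_lockless(pmd);
    if (pmdvalp)
        *pmdvalp = pmdval;
    /*
     * pmd_none() catches the empty case; !pmd_present() is assumed to
     * catch every non-present PMD, including PMD migration entries and
     * device-private THP entries, so the separate
     * is_pmd_migration_entry() and is_device_private_entry() checks
     * would no longer be needed.
     */
    if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
        goto nomap;

This would collapse the migration and device-private special cases into a
single non-present test, at the cost of relying on every architecture
reporting such swap PMDs as not present.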