On 18.02.25 04:55, Alistair Popple wrote:
Currently fs dax pages are considered free when the refcount drops to
one and their refcounts are not increased when mapped via PTEs or
decreased when unmapped. This requires special logic in mm paths to
detect that these pages should not be properly refcounted, and to
detect when the refcount drops to one instead of zero.
On the other hand get_user_pages(), etc. will properly refcount fs dax
pages by taking a reference and dropping it when the page is
unpinned.
Tracking this special behaviour requires extra PTE bits
(e.g. pte_devmap) and introduces rules that are potentially confusing
and specific to FS DAX pages. To fix this, and to possibly allow
removal of the special PTE bits in future, convert the fs dax page
refcounts to be zero based and instead take a reference on the page
each time it is mapped as is currently the case for normal pages.
This may also allow a future clean-up to remove the pgmap refcounting
that is currently done in mm/gup.c.
Signed-off-by: Alistair Popple <apop...@nvidia.com>
Reviewed-by: Dan Williams <dan.j.willi...@intel.com>
A couple of nits (sorry that I didn't manage to review the whole thing
the last time, I am a slow reviewer ...). Likely that can all be
adjusted on top, no need for a full resend IMHO.
index 6674540..cf96f3d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *entry)
return xa_to_value(entry) >> DAX_SHIFT;
}
+static struct folio *dax_to_folio(void *entry)
+{
+ return page_folio(pfn_to_page(dax_to_pfn(entry)));
Nit: return pfn_folio(dax_to_pfn(entry));
+}
+
[...]
-static inline unsigned long dax_folio_share_put(struct folio *folio)
+static inline unsigned long dax_folio_put(struct folio *folio)
{
- return --folio->page.share;
+ unsigned long ref;
+ int order, i;
+
+ if (!dax_folio_is_shared(folio))
+ ref = 0;
+ else
+ ref = --folio->share;
+
out of interest, what synchronizes access to folio->share?
+ if (ref)
+ return ref;
+
+ folio->mapping = NULL;
+ order = folio_order(folio);
+ if (!order)
+ return 0;
+
+ for (i = 0; i < (1UL << order); i++) {
+ struct dev_pagemap *pgmap = page_pgmap(&folio->page);
+ struct page *page = folio_page(folio, i);
+ struct folio *new_folio = (struct folio *)page;
+
+ ClearPageHead(page);
+ clear_compound_head(page);
+
+ new_folio->mapping = NULL;
+ /*
+ * Reset pgmap which was over-written by
+ * prep_compound_page().
+ */
+ new_folio->pgmap = pgmap;
+ new_folio->share = 0;
+ WARN_ON_ONCE(folio_ref_count(new_folio));
+ }
+
+ return ref;
+}
+
+static void dax_folio_init(void *entry)
+{
+ struct folio *folio = dax_to_folio(entry);
+ int order = dax_entry_order(entry);
+
+ /*
+ * Folio should have been split back to order-0 pages in
+ * dax_folio_put() when they were removed from their
+ * final mapping.
+ */
+ WARN_ON_ONCE(folio_order(folio));
+
+ if (order > 0) {
+ prep_compound_page(&folio->page, order);
+ if (order > 1)
+ INIT_LIST_HEAD(&folio->_deferred_list);
Nit: prep_compound_page() -> prep_compound_head() should be taking care
of initializing all folio fields already, so this very likely can be
dropped.
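I.e., just a sketch, assuming prep_compound_head() really does take care
of _deferred_list for order > 1:

	if (order > 0) {
		prep_compound_page(&folio->page, order);
		WARN_ON_ONCE(folio_ref_count(folio));
	}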
+ WARN_ON_ONCE(folio_ref_count(folio));
+ }
}
[...]
}
@@ -1808,7 +1843,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
- int err = 0;
+ struct folio *folio;
+ int ret, err = 0;
pfn_t pfn;
void *kaddr;
@@ -1840,17 +1876,19 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
return dax_fault_return(err);
}
+ folio = dax_to_folio(*entry);
if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
- /* insert PMD pfn */
+ folio_ref_inc(folio);
Why is that not a folio_get()? Could the refcount be 0? Might deserve a
comment then.
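(For reference, IIRC folio_get() is roughly

static inline void folio_get(struct folio *folio)
{
	VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
	folio_ref_inc(folio);
}

so the only difference to a bare folio_ref_inc() is the sanity check that
the refcount is not already 0.)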
if (pmd)
- return vmf_insert_pfn_pmd(vmf, pfn, write);
+ ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)),
+ write);
+ else
+ ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
+ folio_put(folio);
- /* insert PTE pfn */
- if (write)
- return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+ return ret;
}
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
@@ -2089,6 +2127,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ struct folio *folio;
void *entry;
vm_fault_t ret;
@@ -2106,14 +2145,17 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
dax_lock_entry(&xas, entry);
xas_unlock_irq(&xas);
+ folio = pfn_folio(pfn_t_to_pfn(pfn));
+ folio_ref_inc(folio);
Same thought.
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 2333c30..dcc9fcd 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
[...]
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d189826..1a0d6a8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2225,7 +2225,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
I wonder if we actually want to remove the vma_is_dax() check from
vma_is_special_huge(), and instead add it to the remaining callers of
vma_is_special_huge() that still need it -- if any need it.
Did we sanity-check which callers of vma_is_special_huge() still need
it? Is there still reason to have that DAX check in vma_is_special_huge()?
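Something like the following is what I have in mind -- sketch only, based
on what I think the current definition looks like, and all callers would
need auditing first:

static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
	/* DAX handled explicitly by the callers that still care about it. */
	return vma->vm_file &&
	       (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP));
}

... with any caller that still needs the DAX case adding an explicit
vma_is_dax() check, if any actually do.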
But vma_is_special_huge() is rather confusing to me ... the whole
vma_is_special_huge() thing should probably be removed. That's a cleanup
for another day, though.
--
Cheers,
David / dhildenb