The main reason I am advocating a set of pagetable_operations is to
enable the development of a new hugetlb interface.  During the hugetlb
BoF session at OLS last year, we talked about a character device that
would behave like /dev/zero.  Many people said they just wanted to
create MAP_PRIVATE hugetlb mappings without all the fuss of the
hugetlbfs filesystem.  /dev/zero is a familiar interface for getting
anonymous memory, so bringing that model to huge pages would make
programming for anonymous huge pages easier.

The pagetable_operations API opens up possibilities to do some
additional (and completely sane) things.  For example, I have a patch
that alters the character device code below to make use of a hugetlb
ZERO_PAGE.  This eliminates almost all the up-front fault time, allowing
pages to be COW'ed only when first written to.  We cannot do things like
this with hugetlbfs anymore because we have a set of complex semantics
to preserve.

The following patch is an example of what a simple pagetable_operations
consumer could look like.  It does depend on some other cleanups I am
working on (removal of is_file_hugepages(), separation of
fs/hugetlbfs/inode.c from mm/hugetlb.c, etc.), so it is unlikely to apply
to any trees you may have.  I do think it makes a useful illustration of
what legitimate things can be done with a pagetable_operations interface.

commit be72df1c616fb662693a8d4410ce3058f20c71f3
Author: Adam Litke <[EMAIL PROTECTED]>
Date:   Tue Feb 13 14:18:21 2007 -0800

diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index fc11063..c5e755b 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_IPMI_HANDLER)  += ipmi/
 
 obj-$(CONFIG_HANGCHECK_TIMER)  += hangcheck-timer.o
 obj-$(CONFIG_TCG_TPM)          += tpm/
+obj-$(CONFIG_HUGETLB_PAGE)     += page.o
 
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
diff --git a/drivers/char/page.c b/drivers/char/page.c
new file mode 100644
index 0000000..e903028
--- /dev/null
+++ b/drivers/char/page.c
@@ -0,0 +1,202 @@
+/*
+ * Character device for anonymous, private huge page mappings.
+ *
+ * Registers a "page" character device and class with one node, "page-huge".
+ * A MAP_PRIVATE mmap() of that node is flagged VM_HUGETLB and has its faults
+ * serviced through the pagetable_operations installed by page_mmap().
+ */
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+
+/* Device nodes created under the "page" class at init time. */
+static const struct {
+       unsigned int    minor;
+       char            *name;
+       umode_t         mode;   /* NOTE(review): not read anywhere in this file */
+} devlist[] = {
+       {1, "page-huge", S_IRUGO | S_IWUGO},
+};
+
+/*
+ * .nopage handler that must never run: reaching it triggers BUG().
+ * Presumably faults on these mappings are expected to arrive through
+ * page_pagetable_ops.fault instead -- confirm against the
+ * pagetable_operations dispatch code.
+ */
+static struct page *page_nopage(struct vm_area_struct *vma,
+                       unsigned long address, int *unused)
+{
+       BUG();
+       return NULL;
+}
+
+static struct vm_operations_struct page_vm_ops = {
+       .nopage = page_nopage,
+};
+
+/*
+ * page_fault - handle a fault at @address inside a page-device mapping.
+ * @mm:           address space the fault occurred in
+ * @vma:          mapping covering @address
+ * @address:      faulting virtual address
+ * @write_access: non-zero if the access was a write
+ *
+ * If no pte is present, a huge page is allocated, cleared and installed.
+ * Otherwise a write to a non-writable pte is handed to hugetlb_cow().
+ * Both paths are serialized by a function-local mutex.
+ *
+ * Returns VM_FAULT_OOM if the page table or the huge page cannot be
+ * allocated, the result of hugetlb_cow() on the COW path, and
+ * VM_FAULT_MINOR otherwise.
+ */
+static int page_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, int write_access)
+{
+       static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+       struct page *page;
+       pte_t *ptep;
+       pte_t entry, new_entry;
+       /* Default result; overridden on OOM and by hugetlb_cow(). */
+       int ret = VM_FAULT_MINOR;
+
+       ptep = huge_pte_alloc(mm, address);
+       if (!ptep)
+               return VM_FAULT_OOM;
+
+       mutex_lock(&hugetlb_instantiation_mutex);
+       entry = *ptep;
+       if (pte_none(entry)) {
+               page = alloc_huge_page(vma, address);
+               if (!page) {
+                       /* Must not return with the mutex still held. */
+                       ret = VM_FAULT_OOM;
+                       goto out_unlock_mutex;
+               }
+               clear_huge_page(page, address);
+
+               spin_lock(&mm->page_table_lock);
+               if (!pte_none(*ptep)) {
+                       /* The pte got populated meanwhile; drop our page. */
+                       spin_unlock(&mm->page_table_lock);
+                       put_page(page);
+                       goto out_unlock_mutex;
+               }
+               add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+               /*
+                * NOTE(review): the final argument 0 presumably asks for a
+                * non-writable pte so that the first write goes through
+                * hugetlb_cow() -- confirm against make_huge_pte().
+                */
+               new_entry = make_huge_pte(vma, page, 0);
+               set_huge_pte_at(mm, address, ptep, new_entry);
+               goto out_unlock_ptl;
+       }
+
+       spin_lock(&mm->page_table_lock);
+       /* Check for a racing update before calling hugetlb_cow */
+       if (likely(pte_same(entry, *ptep)))
+               if (write_access && !pte_write(entry))
+                       ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+out_unlock_ptl:
+       spin_unlock(&mm->page_table_lock);
+out_unlock_mutex:
+       mutex_unlock(&hugetlb_instantiation_mutex);
+       return ret;
+}
+
+
+/*
+ * Page table operations for page-device mappings.  Only the fault handler
+ * is local; the other hooks point at hugetlb functions defined elsewhere.
+ */
+static struct pagetable_operations_struct page_pagetable_ops = {
+       .copy_vma               = copy_hugetlb_page_range,
+       .pin_pages              = follow_hugetlb_page,
+       .unmap_page_range       = unmap_hugepage_range,
+       .change_protection      = hugetlb_change_protection,
+       .free_pgtable_range     = hugetlb_free_pgd_range,
+       .fault                  = page_fault,
+};
+
+/*
+ * page_mmap - validate a mapping request and attach the huge page handlers.
+ *
+ * Returns -EINVAL for shared mappings, a non-zero vm_pgoff, start or end
+ * addresses with bits set outside HPAGE_MASK, and mappings shorter than
+ * HPAGE_SIZE.  Otherwise flags the vma VM_HUGETLB | VM_RESERVED, installs
+ * page_vm_ops and page_pagetable_ops, and returns 0.
+ */
+static int page_mmap(struct file * file, struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED)
+               return -EINVAL;
+
+       if (vma->vm_pgoff)
+               return -EINVAL;
+
+       if (vma->vm_start & ~HPAGE_MASK)
+               return -EINVAL;
+
+       if (vma->vm_end & ~HPAGE_MASK)
+               return -EINVAL;
+
+       if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+               return -EINVAL;
+
+       vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
+       vma->vm_ops = &page_vm_ops;
+       vma->pagetable_ops = &page_pagetable_ops;
+
+       return 0;
+}
+
+/* File operations registered for the "page" character device major. */
+const struct file_operations page_file_operations = {
+       .mmap                   = page_mmap,
+       .get_unmapped_area      = hugetlb_get_unmapped_area,
+       .prepare_unmapped_area  = prepare_hugepage_range,
+};
+
+static struct class *page_class;
+
+/*
+ * chr_dev_init - register the "page" character device and create its nodes.
+ *
+ * Returns 0 on success, or a negative errno if the dynamic major or the
+ * device class cannot be set up.  The major is unregistered again when
+ * class creation fails.
+ */
+static int __init chr_dev_init(void)
+{
+       int major, i;
+
+       printk("Initializing page devices...");
+       major = register_chrdev(0, "page", &page_file_operations);
+       if (major <= 0) {
+               printk("failed\n");
+               return major ? major : -ENODEV;
+       }
+       printk("(%i:0)\n", major);
+
+       page_class = class_create(THIS_MODULE, "page");
+       if (IS_ERR(page_class)) {
+               /* Do not keep the major registered without a class. */
+               unregister_chrdev(major, "page");
+               return PTR_ERR(page_class);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(devlist); i++)
+               class_device_create(page_class, NULL,
+                       MKDEV(major, devlist[i].minor),
+                       NULL, devlist[i].name);
+
+       return 0;
+}
+fs_initcall(chr_dev_init);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4fc0bca..edd4944 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -590,6 +590,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        BUG_ON(!has_pt_op(vma, fault));
 
+       BUG_ON(!has_pt_op(vma,fault));
        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to