The purpose of this series is:

  Waiman Long reported 'pgd_lock' contention on high CPU count systems and
  proposed moving pgd_lock onto a separate cacheline to eliminate false
  sharing and to reduce some of the lock-bouncing overhead.

  I think we can do much better: this series eliminates the pgd_list and makes
  pgd_alloc()/pgd_free() lockless.
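  The core ordering trick, simplified from the fs/exec.c and pgtable.c
  bits in the interdiff below: a new MM is first made visible on the
  task list, and only then are its kernel PGD entries filled in, so
  task-list walkers cannot miss updates:

	tsk->mm = mm;		/* Make the new MM visible first ... */
	arch_pgd_init_late(mm);	/* ... then copy the kernel PGD entries
				 * from the reference (init) page tables. */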

This is the -v2 submission that addresses all feedback received so far; it
fixes bugs and cleans up details. There's a v1->v2 interdiff attached
further below: most of the changes relate to locking the task before looking
at tsk->mm.
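
The pattern, in a minimal sketch (this is what all the task-list walkers
in the interdiff below now do to stabilize ->mm):

	struct mm_struct *mm;

	task_lock(p);			/* Pins p->mm against exit()/exec(): */
	mm = p->mm;
	if (mm) {
		spin_lock(&mm->page_table_lock);
		/* ... operate on mm->pgd ... */
		spin_unlock(&mm->page_table_lock);
	}
	task_unlock(p);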

The series is also available in -tip:master for easy testing:

  git pull git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master

It's not applied to any visible topic tree yet. (Its sha1 is bc40a788fc63
if you want to test it without pending non-x86 bits in -tip.)

Thanks,

    Ingo

====================>
Ingo Molnar (12):
  x86/mm/pat: Don't free PGD entries on memory unmap
  x86/mm/hotplug: Remove pgd_list use from the memory hotplug code
  x86/mm/hotplug: Don't remove PGD entries in remove_pagetable()
  x86/mm/hotplug: Simplify sync_global_pgds()
  mm: Introduce arch_pgd_init_late()
  x86/mm: Enable and use the arch_pgd_init_late() method
  x86/virt/guest/xen: Remove use of pgd_list from the Xen guest code
  x86/mm: Remove pgd_list use from vmalloc_sync_all()
  x86/mm/pat/32: Remove pgd_list use from the PAT code
  x86/mm: Make pgd_alloc()/pgd_free() lockless
  x86/mm: Remove pgd_list leftovers
  x86/mm: Simplify pgd_alloc()

 arch/Kconfig                      |   9 ++++
 arch/x86/Kconfig                  |   1 +
 arch/x86/include/asm/pgtable.h    |   3 --
 arch/x86/include/asm/pgtable_64.h |   3 +-
 arch/x86/mm/fault.c               |  31 +++++++++-----
 arch/x86/mm/init_64.c             |  80 ++++++++++++-----------------------
 arch/x86/mm/pageattr.c            |  41 +++++++++---------
 arch/x86/mm/pgtable.c             | 131 ++++++++++++++++++++++++++++++----------------------------
 arch/x86/xen/mmu.c                |  51 +++++++++++++++++------
 fs/exec.c                         |   3 ++
 include/linux/mm.h                |   6 +++
 kernel/fork.c                     |  16 +++++++
 12 files changed, 209 insertions(+), 166 deletions(-)

-v1 => -v2 interdiff:
====

 arch/x86/mm/fault.c    | 17 +++++++++++------
 arch/x86/mm/init_64.c  | 23 ++++++++++++-----------
 arch/x86/mm/pageattr.c | 27 ++++++++++++++-------------
 arch/x86/mm/pgtable.c  | 22 ++++++++++++----------
 arch/x86/xen/mmu.c     | 47 +++++++++++++++++++++++++----------------------
 fs/exec.c              |  2 +-
 include/linux/mm.h     |  4 ++--
 kernel/fork.c          |  4 ++--
 8 files changed, 79 insertions(+), 67 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index bebdd97f888b..14b39c3511fd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -237,28 +237,33 @@ void vmalloc_sync_all(void)
 
                struct task_struct *g, *p;
 
-               rcu_read_lock();
-               spin_lock(&pgd_lock);
+               spin_lock(&pgd_lock); /* Implies rcu_read_lock(): */
 
                for_each_process_thread(g, p) {
+                       struct mm_struct *mm;
                        spinlock_t *pgt_lock;
                        pmd_t *pmd_ret;
 
-                       if (!p->mm)
+                       task_lock(p);
+                       mm = p->mm;
+                       if (!mm) {
+                               task_unlock(p);
                                continue;
+                       }
 
                        /* The pgt_lock is only used on Xen: */
-                       pgt_lock = &p->mm->page_table_lock;
+                       pgt_lock = &mm->page_table_lock;
                        spin_lock(pgt_lock);
-                       pmd_ret = vmalloc_sync_one(p->mm->pgd, address);
+                       pmd_ret = vmalloc_sync_one(mm->pgd, address);
                        spin_unlock(pgt_lock);
 
+                       task_unlock(p);
+
                        if (!pmd_ret)
                                break;
                }
 
                spin_unlock(&pgd_lock);
-               rcu_read_unlock();
        }
 }
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 730560c4873e..dcb2f45caf0e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -171,28 +171,28 @@ void sync_global_pgds(unsigned long start, unsigned long end)
                const pgd_t *pgd_ref = pgd_offset_k(address);
                struct task_struct *g, *p;
 
-               /*
-                * When this function is called after memory hot remove,
-                * pgd_none() already returns true, but only the reference
-                * kernel PGD has been cleared, not the process PGDs.
-                *
-                * So clear the affected entries in every process PGD as well:
-                */
+               /* Only sync (potentially) newly added PGD entries: */
                if (pgd_none(*pgd_ref))
                        continue;
 
-               spin_lock(&pgd_lock);
+               spin_lock(&pgd_lock); /* Implies rcu_read_lock() for the task list iteration: */
 
                for_each_process_thread(g, p) {
+                       struct mm_struct *mm;
                        pgd_t *pgd;
                        spinlock_t *pgt_lock;
 
-                       if (!p->mm)
+                       task_lock(p);
+                       mm = p->mm;
+                       if (!mm) {
+                               task_unlock(p);
                                continue;
-                       pgd = p->mm->pgd;
+                       }
+
+                       pgd = mm->pgd;
 
                        /* The pgt_lock is only used by Xen: */
-                       pgt_lock = &p->mm->page_table_lock;
+                       pgt_lock = &mm->page_table_lock;
                        spin_lock(pgt_lock);
 
                        if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
@@ -202,6 +202,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
                                set_pgd(pgd, *pgd_ref);
 
                        spin_unlock(pgt_lock);
+                       task_unlock(p);
                }
                spin_unlock(&pgd_lock);
        }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 93c134fdb398..4ff6a1808f1d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -440,29 +440,30 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
        if (!SHARED_KERNEL_PMD) {
                struct task_struct *g, *p;
 
-               rcu_read_lock();
+               /* We are holding pgd_lock, which implies rcu_read_lock(): */
 
                for_each_process_thread(g, p) {
+                       struct mm_struct *mm;
                        spinlock_t *pgt_lock;
                        pgd_t *pgd;
                        pud_t *pud;
                        pmd_t *pmd;
 
-                       if (!p->mm)
-                               continue;
+                       task_lock(p);
+                       mm = p->mm;
+                       if (mm) {
+                               pgt_lock = &mm->page_table_lock;
+                               spin_lock(pgt_lock);
 
-                       pgt_lock = &p->mm->page_table_lock;
-                       spin_lock(pgt_lock);
+                               pgd = mm->pgd + pgd_index(address);
+                               pud = pud_offset(pgd, address);
+                               pmd = pmd_offset(pud, address);
+                               set_pte_atomic((pte_t *)pmd, pte);
 
-                       pgd = p->mm->pgd + pgd_index(address);
-                       pud = pud_offset(pgd, address);
-                       pmd = pmd_offset(pud, address);
-                       set_pte_atomic((pte_t *)pmd, pte);
-
-                       spin_unlock(pgt_lock);
+                               spin_unlock(pgt_lock);
+                       }
+                       task_unlock(p);
                }
-
-               rcu_read_unlock();
        }
 #endif
 }
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b1bd35f452ef..d7d341e57e33 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -348,15 +348,17 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
  * at the highest, PGD level. Thus any other task extending it
  * will first update the reference PGD, then modify the task PGDs.
  */
-void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd)
+void arch_pgd_init_late(struct mm_struct *mm)
 {
        /*
-        * This is called after a new MM has been made visible
-        * in fork() or exec().
+        * This function is called after a new MM has been made visible
+        * in fork() or exec() via:
+        *
+        *   tsk->mm = mm;
         *
         * This barrier makes sure the MM is visible to new RCU
-        * walkers before we initialize it, so that we don't miss
-        * updates:
+        * walkers before we initialize the pagetables below, so that
+        * we don't miss updates:
         */
        smp_wmb();
 
@@ -365,12 +367,12 @@ void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd)
         * ptes in non-PAE, or shared PMD in PAE), then just copy the
         * references from swapper_pg_dir:
         */
-       if (CONFIG_PGTABLE_LEVELS == 2 ||
+       if ( CONFIG_PGTABLE_LEVELS == 2 ||
            (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-           CONFIG_PGTABLE_LEVELS == 4) {
+            CONFIG_PGTABLE_LEVELS == 4) {
 
                pgd_t *pgd_src = swapper_pg_dir + KERNEL_PGD_BOUNDARY;
-               pgd_t *pgd_dst =            pgd + KERNEL_PGD_BOUNDARY;
+               pgd_t *pgd_dst =        mm->pgd + KERNEL_PGD_BOUNDARY;
                int i;
 
                for (i = 0; i < KERNEL_PGD_PTRS; i++, pgd_src++, pgd_dst++) {
@@ -387,8 +389,8 @@ void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd)
                         * it won't change from under us. Parallel updates (new
                         * allocations) will modify our (already visible) PGD:
                         */
-                       if (pgd_val(*pgd_src))
-                               WRITE_ONCE(*pgd_dst, *pgd_src);
+                       if (!pgd_none(*pgd_src))
+                               set_pgd(pgd_dst, *pgd_src);
                }
        }
 }
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 87a8354435f8..70a3df5b0b54 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -855,27 +855,28 @@ void xen_mm_pin_all(void)
 {
        struct task_struct *g, *p;
 
-       rcu_read_lock();
-       spin_lock(&pgd_lock);
+       spin_lock(&pgd_lock); /* Implies rcu_read_lock() for the task list iteration: */
 
        for_each_process_thread(g, p) {
+               struct mm_struct *mm;
                struct page *page;
                pgd_t *pgd;
 
-               if (!p->mm)
-                       continue;
-
-               pgd = p->mm->pgd;
-               page = virt_to_page(pgd);
+               task_lock(p);
+               mm = p->mm;
+               if (mm) {
+                       pgd = mm->pgd;
+                       page = virt_to_page(pgd);
 
-               if (!PagePinned(page)) {
-                       __xen_pgd_pin(&init_mm, pgd);
-                       SetPageSavePinned(page);
+                       if (!PagePinned(page)) {
+                               __xen_pgd_pin(&init_mm, pgd);
+                               SetPageSavePinned(page);
+                       }
                }
+               task_unlock(p);
        }
 
        spin_unlock(&pgd_lock);
-       rcu_read_unlock();
 }
 
 /*
@@ -980,24 +981,26 @@ void xen_mm_unpin_all(void)
 {
        struct task_struct *g, *p;
 
-       rcu_read_lock();
-       spin_lock(&pgd_lock);
+       spin_lock(&pgd_lock); /* Implies rcu_read_lock() for the task list iteration: */
 
        for_each_process_thread(g, p) {
+               struct mm_struct *mm;
                struct page *page;
                pgd_t *pgd;
 
-               if (!p->mm)
-                       continue;
-
-               pgd = p->mm->pgd;
-               page = virt_to_page(pgd);
+               task_lock(p);
+               mm = p->mm;
+               if (mm) {
+                       pgd = mm->pgd;
+                       page = virt_to_page(pgd);
 
-               if (PageSavePinned(page)) {
-                       BUG_ON(!PagePinned(page));
-                       __xen_pgd_unpin(&init_mm, pgd);
-                       ClearPageSavePinned(page);
+                       if (PageSavePinned(page)) {
+                               BUG_ON(!PagePinned(page));
+                               __xen_pgd_unpin(&init_mm, pgd);
+                               ClearPageSavePinned(page);
+                       }
                }
+               task_unlock(p);
        }
 
        spin_unlock(&pgd_lock);
diff --git a/fs/exec.c b/fs/exec.c
index c1d213c64fda..4ce1383d5bba 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -862,7 +862,7 @@ static int exec_mmap(struct mm_struct *mm)
        active_mm = tsk->active_mm;
 
        tsk->mm = mm;
-       arch_pgd_init_late(mm, mm->pgd);
+       arch_pgd_init_late(mm);
 
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 35d887d2b038..a3edc839e431 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1135,9 +1135,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);
 
 #ifdef CONFIG_ARCH_HAS_PGD_INIT_LATE
-void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd);
+void arch_pgd_init_late(struct mm_struct *mm);
 #else
-static inline void arch_pgd_init_late(struct mm_struct *mm, pgd_t *pgd) { }
+static inline void arch_pgd_init_late(struct mm_struct *mm) { }
 #endif
 
 static inline void unmap_shared_mapping_range(struct address_space *mapping,
diff --git a/kernel/fork.c b/kernel/fork.c
index 1f83ceca6c6c..cfa84971fb52 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1605,8 +1605,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         * NOTE: any user-space parts of the PGD are already initialized
         *       and must not be clobbered.
         */
-       if (p->mm != current->mm)
-               arch_pgd_init_late(p->mm, p->mm->pgd);
+       if (!(clone_flags & CLONE_VM))
+               arch_pgd_init_late(p->mm);
 
        proc_fork_connector(p);
        cgroup_post_fork(p);