From: Srikar Dronamraju <sri...@linux.vnet.ibm.com> Date: Tue, 30 Apr 2013 01:18:08 -0500 Subject: [PATCH 1/2] numa: Track last pid accessing a page.
This change is mostly extracted from ff2a9f9: numa, mm, sched: Implement last-CPU+PID hash tracking from tip/numa/core. We rely on the page::last_nid field (embedded in remaining bits of the page flags field) to drive NUMA placement: the last_nid gives us information about which tasks access memory on what node. Let's consider a page that is mostly private, i.e. accessed mostly by one task. If such a task is being moved to a different node, then move the page on the first access from the new node. The cost is 8 more bits used from the page flags - this space is still available on 64-bit systems. There is the potential of false sharing if the PIDs of two tasks are equal modulo 256 - this degrades the statistics somewhat but does not completely invalidate them. Related tasks are typically launched close to each other. Cc: Peter Zijlstra <a.p.zijls...@chello.nl> Cc: Andrea Arcangeli <aarca...@redhat.com> Cc: Rik van Riel <r...@redhat.com> Cc: Mel Gorman <mgor...@suse.de> Originally-from: Ingo Molnar <mi...@kernel.org> Signed-off-by: Srikar Dronamraju <sri...@linux.vnet.ibm.com> --- include/linux/mm.h | 72 ++++++++++++++++++++++++------------- include/linux/mm_types.h | 4 +- include/linux/page-flags-layout.h | 25 ++++++++----- mm/huge_memory.c | 2 +- mm/memory.c | 4 +- mm/mempolicy.c | 20 ++++++++--- mm/migrate.c | 4 +- mm/mm_init.c | 10 +++--- mm/mmzone.c | 14 ++++---- mm/page_alloc.c | 4 +- 10 files changed, 99 insertions(+), 60 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e2091b8..2e3a3db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -582,11 +582,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ...
| FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) +#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) +#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -618,7 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) +#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -662,51 +662,73 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_nid_xchg_last(struct page *page, int nid) + +static inline int nidpid_to_nid(int nidpid) { - return xchg(&page->_last_nid, nid); + return (nidpid >> NIDPID_PID_BITS) & NIDPID_NID_MASK; } -static inline int page_nid_last(struct page *page) +static inline int nidpid_to_pid(int nidpid) { - return page->_last_nid; + return nidpid & NIDPID_PID_MASK; } -static inline void page_nid_reset_last(struct page *page) + +static inline int nid_pid_to_nidpid(int nid, int pid) { - 
page->_last_nid = -1; + return ((nid & NIDPID_NID_MASK) << NIDPID_PID_BITS) | (pid & NIDPID_PID_MASK); } -#else -static inline int page_nid_last(struct page *page) + +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +static inline int page_xchg_last_nidpid(struct page *page, int nidpid) { - return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; + return xchg(&page->_last_nidpid, nidpid); } -extern int page_nid_xchg_last(struct page *page, int nid); - -static inline void page_nid_reset_last(struct page *page) +static inline int page_last_nidpid(struct page *page) { - int nid = (1 << LAST_NID_SHIFT) - 1; + return page->_last_nidpid; +} - page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; +static inline void reset_page_last_nidpid(struct page *page) +{ + page->_last_nidpid = -1; } -#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ + #else -static inline int page_nid_xchg_last(struct page *page, int nid) + +extern int page_xchg_last_nidpid(struct page *page, int nidpid); +static inline int page_last_nidpid(struct page *page) +{ + return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; +} + +static inline void reset_page_last_nidpid(struct page *page) +{ + page_xchg_last_nidpid(page, -1); +} +#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ + +static inline int page_last_pid(struct page *page) +{ + return nidpid_to_pid(page_last_nidpid(page)); +} + +#else /* !CONFIG_NUMA_BALANCING: */ +static inline int page_xchg_last_nidpid(struct page *page, int nidpid) { return page_to_nid(page); } -static inline int page_nid_last(struct page *page) +static inline int page_last_nidpid(struct page *page) { return page_to_nid(page); } -static inline void page_nid_reset_last(struct page *page) +static inline void reset_page_last_nidpid(struct page *page) { } -#endif + +#endif /* !CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 
ace9a5f..ccb20b9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,8 +174,8 @@ struct page { void *shadow; #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS - int _last_nid; +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS + int _last_nidpid; #endif } /* diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 93506a1..c17279a 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -39,9 +39,9 @@ * lookup is necessary. * * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | + * " plus space for last_nid: | NODE | ZONE | LAST_NIDPID ... | FLAGS | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | + * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... 
| FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -61,16 +61,23 @@ #define NODES_WIDTH 0 #endif +/* Reduce false sharing: */ +#define NIDPID_PID_BITS 8 +#define NIDPID_PID_MASK ((1 << NIDPID_PID_BITS)-1) + +#define NIDPID_NID_BITS NODES_SHIFT +#define NIDPID_NID_MASK ((1 << NIDPID_NID_BITS)-1) + #ifdef CONFIG_NUMA_BALANCING -#define LAST_NID_SHIFT NODES_SHIFT +# define LAST_NIDPID_SHIFT (NIDPID_NID_BITS+NIDPID_PID_BITS) #else -#define LAST_NID_SHIFT 0 +# define LAST_NIDPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define LAST_NID_WIDTH LAST_NID_SHIFT +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +# define LAST_NIDPID_WIDTH LAST_NIDPID_SHIFT #else -#define LAST_NID_WIDTH 0 +# define LAST_NIDPID_WIDTH 0 #endif /* @@ -81,8 +88,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 -#define LAST_NID_NOT_IN_PAGE_FLAGS +#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0 +# define LAST_NIDPID_NOT_IN_PAGE_FLAGS #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e2f7f5a..798297a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1639,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_nid_xchg_last(page_tail, page_nid_last(page)); + page_xchg_last_nidpid(page_tail, page_last_nidpid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/memory.c b/mm/memory.c index ba94dec..e819b3e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA config, growing page-frame for last_nidpid. 
#endif #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7431001..4aa64dd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2286,11 +2286,13 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long BUG(); } +#ifdef CONFIG_NUMA_BALANCING /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; + int last_nidpid, this_nidpid; polnid = numa_node_id(); + this_nidpid = nid_pid_to_nidpid(polnid, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2313,11 +2315,19 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) - goto out; + last_nidpid = page_xchg_last_nidpid(page, this_nidpid); + if (curnid != polnid) { + int last_pid = nidpid_to_pid(last_nidpid); + int this_pid = current->pid & NIDPID_PID_MASK; + + /* Freshly allocated pages not accessed by anyone else yet: */ + if (last_pid == this_pid || last_pid == (-1 & NIDPID_PID_MASK) || + (nidpid_to_nid(last_nidpid) == polnid)) + ret = polnid; + } + goto out; } - +#endif if (curnid != polnid) ret = polnid; out: diff --git a/mm/migrate.c b/mm/migrate.c index 3bbaf5d..74fcd76 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1478,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_nid_xchg_last(newpage, page_nid_last(page)); + page_xchg_last_nidpid(newpage, page_last_nidpid(page)); return newpage; } @@ -1660,7 +1660,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_nid_xchg_last(new_page, page_nid_last(page)); + page_xchg_last_nidpid(new_page, page_last_nidpid(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { diff --git a/mm/mm_init.c b/mm/mm_init.c index c280a02..0a0c0d3 100644 --- 
a/mm/mm_init.c +++ b/mm/mm_init.c @@ -69,26 +69,26 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", "Section %d Node %d Zone %d Lastnid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, - LAST_NID_WIDTH, + LAST_NIDPID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", "Section %d Node %d Zone %d Lastnid %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_NID_SHIFT); + LAST_NIDPID_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", "Section %lu Node %lu Zone %lu Lastnid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_NID_PGSHIFT); + (unsigned long)LAST_NIDPID_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), @@ -100,7 +100,7 @@ void __init mminit_verify_pageflags_layout(void) mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Last nid not in page flags"); #endif diff --git a/mm/mmzone.c b/mm/mmzone.c index 2ac0afb..a9958a1 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) INIT_LIST_HEAD(&lruvec->lists[lru]); } -#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) -int page_nid_xchg_last(struct page *page, int nid) +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NIDPID_NOT_IN_PAGE_FLAGS) +int page_xchg_last_nidpid(struct page *page, int nidpid) { unsigned long old_flags, flags; - int last_nid; + int 
last_nidpid; do { old_flags = flags = page->flags; - last_nid = page_nid_last(page); + last_nidpid = (flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; - flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); + flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - return last_nid; + return last_nidpid; } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fcced7..d4d0540 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -613,7 +613,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - page_nid_reset_last(page); + reset_page_last_nidpid(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3910,7 +3910,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); - page_nid_reset_last(page); + reset_page_last_nidpid(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- 1.7.1 -- Thanks and Regards Srikar Dronamraju -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/