On Fri, Jan 10, 2014 at 01:10:39PM -0500, Johannes Weiner wrote:
> shmem mappings already contain exceptional entries where swap slot
> information is remembered.
> 
> To be able to store eviction information for regular page cache,
> prepare every site dealing with the radix trees directly to handle
> entries other than pages.
> 
> The common lookup functions will filter out non-page entries and
> return NULL for page cache holes, just as before.  But provide a raw
> version of the API which returns non-page entries as well, and switch
> shmem over to use it.
> 
> Signed-off-by: Johannes Weiner <han...@cmpxchg.org>
Reviewed-by: Minchan Kim <minc...@kernel.org>

Below are just nitpicks.

> ---
>  fs/btrfs/compression.c   |   2 +-
>  include/linux/mm.h       |   8 ++
>  include/linux/pagemap.h  |  15 ++--
>  include/linux/pagevec.h  |   3 +
>  include/linux/shmem_fs.h |   1 +
>  mm/filemap.c             | 196 
> +++++++++++++++++++++++++++++++++++++++++------
>  mm/mincore.c             |  20 +++--
>  mm/readahead.c           |   2 +-
>  mm/shmem.c               |  97 +++++------------------
>  mm/swap.c                |  47 ++++++++++++
>  mm/truncate.c            |  73 ++++++++++++++----
>  11 files changed, 336 insertions(+), 128 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 6aad98cb343f..c88316587900 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
>               rcu_read_lock();
>               page = radix_tree_lookup(&mapping->page_tree, pg_index);
>               rcu_read_unlock();
> -             if (page) {
> +             if (page && !radix_tree_exceptional_entry(page)) {
>                       misses++;
>                       if (misses > 4)
>                               break;
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8b6e55ee8855..c09ef3ae55bc 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -906,6 +906,14 @@ extern void show_free_areas(unsigned int flags);
>  extern bool skip_free_areas_node(unsigned int flags, int nid);
>  
>  int shmem_zero_setup(struct vm_area_struct *);
> +#ifdef CONFIG_SHMEM
> +bool shmem_mapping(struct address_space *mapping);
> +#else
> +static inline bool shmem_mapping(struct address_space *mapping)
> +{
> +     return false;
> +}
> +#endif
>  
>  extern int can_do_mlock(void);
>  extern int user_shm_lock(size_t, struct user_struct *);
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index c73130c607c4..b6854b7c58cb 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -248,12 +248,15 @@ pgoff_t page_cache_next_hole(struct address_space 
> *mapping,
>  pgoff_t page_cache_prev_hole(struct address_space *mapping,
>                            pgoff_t index, unsigned long max_scan);
>  
> -extern struct page * find_get_page(struct address_space *mapping,
> -                             pgoff_t index);
> -extern struct page * find_lock_page(struct address_space *mapping,
> -                             pgoff_t index);
> -extern struct page * find_or_create_page(struct address_space *mapping,
> -                             pgoff_t index, gfp_t gfp_mask);
> +struct page *__find_get_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_get_page(struct address_space *mapping, pgoff_t offset);
> +struct page *__find_lock_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset);
> +struct page *find_or_create_page(struct address_space *mapping, pgoff_t 
> index,
> +                              gfp_t gfp_mask);
> +unsigned __find_get_pages(struct address_space *mapping, pgoff_t start,
> +                       unsigned int nr_pages, struct page **pages,
> +                       pgoff_t *indices);
>  unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
>                       unsigned int nr_pages, struct page **pages);
>  unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
> diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
> index e4dbfab37729..3c6b8b1e945b 100644
> --- a/include/linux/pagevec.h
> +++ b/include/linux/pagevec.h
> @@ -22,6 +22,9 @@ struct pagevec {
>  
>  void __pagevec_release(struct pagevec *pvec);
>  void __pagevec_lru_add(struct pagevec *pvec);
> +unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space 
> *mapping,
> +                       pgoff_t start, unsigned nr_pages, pgoff_t *indices);
> +void pagevec_remove_exceptionals(struct pagevec *pvec);
>  unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
>               pgoff_t start, unsigned nr_pages);
>  unsigned pagevec_lookup_tag(struct pagevec *pvec,
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 30aa0dc60d75..deb49609cd36 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name,
>                                       loff_t size, unsigned long flags);
>  extern int shmem_zero_setup(struct vm_area_struct *);
>  extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
> +extern bool shmem_mapping(struct address_space *mapping);
>  extern void shmem_unlock_mapping(struct address_space *mapping);
>  extern struct page *shmem_read_mapping_page_gfp(struct address_space 
> *mapping,
>                                       pgoff_t index, gfp_t gfp_mask);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 0746b7a4658f..23eb3be27205 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -446,6 +446,29 @@ int replace_page_cache_page(struct page *old, struct 
> page *new, gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL_GPL(replace_page_cache_page);
>  
> +static int page_cache_tree_insert(struct address_space *mapping,
> +                               struct page *page)
> +{
> +     void **slot;
> +     int error;
> +
> +     slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
> +     if (slot) {
> +             void *p;
> +
> +             p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
> +             if (!radix_tree_exceptional_entry(p))
> +                 return -EEXIST;
> +             radix_tree_replace_slot(slot, page);
> +             mapping->nrpages++;
> +             return 0;
> +     }
> +     error = radix_tree_insert(&mapping->page_tree, page->index, page);
> +     if (!error)
> +             mapping->nrpages++;
> +     return error;
> +}
> +
>  /**
>   * add_to_page_cache_locked - add a locked page to the pagecache
>   * @page:    page to add
> @@ -480,11 +503,10 @@ int add_to_page_cache_locked(struct page *page, struct 
> address_space *mapping,
>       page->index = offset;
>  
>       spin_lock_irq(&mapping->tree_lock);
> -     error = radix_tree_insert(&mapping->page_tree, offset, page);
> +     error = page_cache_tree_insert(mapping, page);
>       radix_tree_preload_end();
>       if (unlikely(error))
>               goto err_insert;
> -     mapping->nrpages++;
>       __inc_zone_page_state(page, NR_FILE_PAGES);
>       spin_unlock_irq(&mapping->tree_lock);
>       trace_mm_filemap_add_to_page_cache(page);
> @@ -712,7 +734,10 @@ pgoff_t page_cache_next_hole(struct address_space 
> *mapping,
>       unsigned long i;
>  
>       for (i = 0; i < max_scan; i++) {
> -             if (!radix_tree_lookup(&mapping->page_tree, index))
> +             struct page *page;
> +
> +             page = radix_tree_lookup(&mapping->page_tree, index);
> +             if (!page || radix_tree_exceptional_entry(page))
>                       break;
>               index++;
>               if (index == 0)
> @@ -750,7 +775,10 @@ pgoff_t page_cache_prev_hole(struct address_space 
> *mapping,
>       unsigned long i;
>  
>       for (i = 0; i < max_scan; i++) {
> -             if (!radix_tree_lookup(&mapping->page_tree, index))
> +             struct page *page;
> +
> +             page = radix_tree_lookup(&mapping->page_tree, index);
> +             if (!page || radix_tree_exceptional_entry(page))
>                       break;
>               index--;
>               if (index == ULONG_MAX)
> @@ -762,14 +790,19 @@ pgoff_t page_cache_prev_hole(struct address_space 
> *mapping,
>  EXPORT_SYMBOL(page_cache_prev_hole);
>  
>  /**
> - * find_get_page - find and get a page reference
> + * __find_get_page - find and get a page reference
>   * @mapping: the address_space to search
>   * @offset: the page index
>   *
> - * Is there a pagecache struct page at the given (mapping, offset) tuple?
> - * If yes, increment its refcount and return it; if no, return NULL.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned with an increased refcount.
> + *
> + * If the slot holds a shadow entry of a previously evicted page, it
> + * is returned.
> + *
> + * Otherwise, %NULL is returned.
>   */
> -struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
> +struct page *__find_get_page(struct address_space *mapping, pgoff_t offset)
>  {
>       void **pagep;
>       struct page *page;
> @@ -810,24 +843,49 @@ out:
>  
>       return page;
>  }
> +EXPORT_SYMBOL(__find_get_page);
> +
> +/**
> + * find_get_page - find and get a page reference
> + * @mapping: the address_space to search
> + * @offset: the page index
> + *
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned with an increased refcount.
> + *
> + * Otherwise, %NULL is returned.
> + */
> +struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
> +{
> +     struct page *page = __find_get_page(mapping, offset);
> +
> +     if (radix_tree_exceptional_entry(page))
> +             page = NULL;
> +     return page;
> +}
>  EXPORT_SYMBOL(find_get_page);
>  
>  /**
> - * find_lock_page - locate, pin and lock a pagecache page
> + * __find_lock_page - locate, pin and lock a pagecache page
>   * @mapping: the address_space to search
>   * @offset: the page index
>   *
> - * Locates the desired pagecache page, locks it, increments its reference
> - * count and returns its address.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
> + *
> + * If the slot holds a shadow entry of a previously evicted page, it
> + * is returned.
> + *
> + * Otherwise, %NULL is returned.
>   *
> - * Returns zero if the page was not present. find_lock_page() may sleep.
> + * __find_lock_page() may sleep.
>   */
> -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
> +struct page *__find_lock_page(struct address_space *mapping, pgoff_t offset)
>  {
>       struct page *page;
> -
>  repeat:
> -     page = find_get_page(mapping, offset);
> +     page = __find_get_page(mapping, offset);
>       if (page && !radix_tree_exception(page)) {
>               lock_page(page);
>               /* Has the page been truncated? */
> @@ -840,6 +898,29 @@ repeat:
>       }
>       return page;
>  }
> +EXPORT_SYMBOL(__find_lock_page);
> +
> +/**
> + * find_lock_page - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + *
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
> + *
> + * Otherwise, %NULL is returned.
> + *
> + * find_lock_page() may sleep.
> + */
> +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
> +{
> +     struct page *page = __find_lock_page(mapping, offset);
> +
> +     if (radix_tree_exceptional_entry(page))
> +             page = NULL;
> +     return page;
> +}
>  EXPORT_SYMBOL(find_lock_page);
>  
>  /**
> @@ -848,16 +929,18 @@ EXPORT_SYMBOL(find_lock_page);
>   * @index: the page's index into the mapping
>   * @gfp_mask: page allocation mode
>   *
> - * Locates a page in the pagecache.  If the page is not present, a new page
> - * is allocated using @gfp_mask and is added to the pagecache and to the VM's
> - * LRU list.  The returned page is locked and has its reference count
> - * incremented.
> + * Looks up the page cache slot at @mapping & @offset.  If there is a
> + * page cache page, it is returned locked and with an increased
> + * refcount.
>   *
> - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
> - * allocation!
> + * If the page is not present, a new page is allocated using @gfp_mask
> + * and added to the page cache and the VM's LRU list.  The page is
> + * returned locked and with an increased refcount.
>   *
> - * find_or_create_page() returns the desired page's address, or zero on
> - * memory exhaustion.
> + * On memory exhaustion, %NULL is returned.
> + *
> + * find_or_create_page() may sleep, even if @gfp_flags specifies an
> + * atomic allocation!
>   */
>  struct page *find_or_create_page(struct address_space *mapping,
>               pgoff_t index, gfp_t gfp_mask)
> @@ -890,6 +973,73 @@ repeat:
>  EXPORT_SYMBOL(find_or_create_page);
>  
>  /**
> + * __find_get_pages - gang pagecache lookup
> + * @mapping: The address_space to search
> + * @start:   The starting page index
> + * @nr_pages:        The maximum number of pages
> + * @pages:   Where the resulting pages are placed

where is @indices?

> + *
> + * __find_get_pages() will search for and return a group of up to
> + * @nr_pages pages in the mapping.  The pages are placed at @pages.
> + * __find_get_pages() takes a reference against the returned pages.
> + *
> + * The search returns a group of mapping-contiguous pages with ascending
> + * indexes.  There may be holes in the indices due to not-present pages.
> + *
> + * Any shadow entries of evicted pages are included in the returned
> + * array.
> + *
> + * __find_get_pages() returns the number of pages and shadow entries
> + * which were found.
> + */
> +unsigned __find_get_pages(struct address_space *mapping,
> +                       pgoff_t start, unsigned int nr_pages,
> +                       struct page **pages, pgoff_t *indices)
> +{
> +     void **slot;
> +     unsigned int ret = 0;
> +     struct radix_tree_iter iter;
> +
> +     if (!nr_pages)
> +             return 0;
> +
> +     rcu_read_lock();
> +restart:
> +     radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> +             struct page *page;
> +repeat:
> +             page = radix_tree_deref_slot(slot);
> +             if (unlikely(!page))
> +                     continue;
> +             if (radix_tree_exception(page)) {
> +                     if (radix_tree_deref_retry(page))
> +                             goto restart;
> +                     /*
> +                      * Otherwise, we must be storing a swap entry
> +                      * here as an exceptional entry: so return it
> +                      * without attempting to raise page count.
> +                      */
> +                     goto export;
> +             }
> +             if (!page_cache_get_speculative(page))
> +                     goto repeat;
> +
> +             /* Has the page moved? */
> +             if (unlikely(page != *slot)) {
> +                     page_cache_release(page);
> +                     goto repeat;
> +             }
> +export:
> +             indices[ret] = iter.index;
> +             pages[ret] = page;
> +             if (++ret == nr_pages)
> +                     break;
> +     }
> +     rcu_read_unlock();
> +     return ret;
> +}
> +
> +/**
>   * find_get_pages - gang pagecache lookup
>   * @mapping: The address_space to search
>   * @start:   The starting page index
> diff --git a/mm/mincore.c b/mm/mincore.c
> index da2be56a7b8f..ad411ec86a55 100644
> --- a/mm/mincore.c
> +++ b/mm/mincore.c
> @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space 
> *mapping, pgoff_t pgoff)
>        * any other file mapping (ie. marked !present and faulted in with
>        * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
>        */
> -     page = find_get_page(mapping, pgoff);
>  #ifdef CONFIG_SWAP
> -     /* shmem/tmpfs may return swap: account for swapcache page too. */
> -     if (radix_tree_exceptional_entry(page)) {
> -             swp_entry_t swap = radix_to_swp_entry(page);
> -             page = find_get_page(swap_address_space(swap), swap.val);
> -     }
> +     if (shmem_mapping(mapping)) {
> +             page = __find_get_page(mapping, pgoff);
> +             /*
> +              * shmem/tmpfs may return swap: account for swapcache
> +              * page too.
> +              */
> +             if (radix_tree_exceptional_entry(page)) {
> +                     swp_entry_t swp = radix_to_swp_entry(page);
> +                     page = find_get_page(swap_address_space(swp), swp.val);
> +             }
> +     } else
> +             page = find_get_page(mapping, pgoff);
> +#else
> +     page = find_get_page(mapping, pgoff);
>  #endif
>       if (page) {
>               present = PageUptodate(page);
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 9eeeeda4ac0e..912c00358112 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -179,7 +179,7 @@ __do_page_cache_readahead(struct address_space *mapping, 
> struct file *filp,
>               rcu_read_lock();
>               page = radix_tree_lookup(&mapping->page_tree, page_offset);
>               rcu_read_unlock();
> -             if (page)
> +             if (page && !radix_tree_exceptional_entry(page))
>                       continue;
>  
>               page = page_cache_alloc_readahead(mapping);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 7c67249d6f28..1f4b65f7b831 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -329,56 +329,6 @@ static void shmem_delete_from_page_cache(struct page 
> *page, void *radswap)
>  }
>  
>  /*
> - * Like find_get_pages, but collecting swap entries as well as pages.
> - */
> -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
> -                                     pgoff_t start, unsigned int nr_pages,
> -                                     struct page **pages, pgoff_t *indices)
> -{
> -     void **slot;
> -     unsigned int ret = 0;
> -     struct radix_tree_iter iter;
> -
> -     if (!nr_pages)
> -             return 0;
> -
> -     rcu_read_lock();
> -restart:
> -     radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> -             struct page *page;
> -repeat:
> -             page = radix_tree_deref_slot(slot);
> -             if (unlikely(!page))
> -                     continue;
> -             if (radix_tree_exception(page)) {
> -                     if (radix_tree_deref_retry(page))
> -                             goto restart;
> -                     /*
> -                      * Otherwise, we must be storing a swap entry
> -                      * here as an exceptional entry: so return it
> -                      * without attempting to raise page count.
> -                      */
> -                     goto export;
> -             }
> -             if (!page_cache_get_speculative(page))
> -                     goto repeat;
> -
> -             /* Has the page moved? */
> -             if (unlikely(page != *slot)) {
> -                     page_cache_release(page);
> -                     goto repeat;
> -             }
> -export:
> -             indices[ret] = iter.index;
> -             pages[ret] = page;
> -             if (++ret == nr_pages)
> -                     break;
> -     }
> -     rcu_read_unlock();
> -     return ret;
> -}
> -
> -/*
>   * Remove swap entry from radix tree, free the swap and its page cache.
>   */
>  static int shmem_free_swap(struct address_space *mapping,
> @@ -396,21 +346,6 @@ static int shmem_free_swap(struct address_space *mapping,
>  }
>  
>  /*
> - * Pagevec may contain swap entries, so shuffle up pages before releasing.
> - */
> -static void shmem_deswap_pagevec(struct pagevec *pvec)
> -{
> -     int i, j;
> -
> -     for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
> -             struct page *page = pvec->pages[i];
> -             if (!radix_tree_exceptional_entry(page))
> -                     pvec->pages[j++] = page;
> -     }
> -     pvec->nr = j;
> -}
> -
> -/*
>   * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
>   */
>  void shmem_unlock_mapping(struct address_space *mapping)
> @@ -428,12 +363,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
>                * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
>                * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
>                */
> -             pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +             pvec.nr = __find_get_pages(mapping, index,
>                                       PAGEVEC_SIZE, pvec.pages, indices);
>               if (!pvec.nr)
>                       break;
>               index = indices[pvec.nr - 1] + 1;
> -             shmem_deswap_pagevec(&pvec);
> +             pagevec_remove_exceptionals(&pvec);
>               check_move_unevictable_pages(pvec.pages, pvec.nr);
>               pagevec_release(&pvec);
>               cond_resched();
> @@ -465,9 +400,9 @@ static void shmem_undo_range(struct inode *inode, loff_t 
> lstart, loff_t lend,
>       pagevec_init(&pvec, 0);
>       index = start;
>       while (index < end) {
> -             pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> -                             min(end - index, (pgoff_t)PAGEVEC_SIZE),
> -                                                     pvec.pages, indices);
> +             pvec.nr = __find_get_pages(mapping, index,
> +                     min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +                     pvec.pages, indices);
>               if (!pvec.nr)
>                       break;
>               mem_cgroup_uncharge_start();
> @@ -496,7 +431,7 @@ static void shmem_undo_range(struct inode *inode, loff_t 
> lstart, loff_t lend,
>                       }
>                       unlock_page(page);
>               }
> -             shmem_deswap_pagevec(&pvec);
> +             pagevec_remove_exceptionals(&pvec);
>               pagevec_release(&pvec);
>               mem_cgroup_uncharge_end();
>               cond_resched();
> @@ -534,9 +469,10 @@ static void shmem_undo_range(struct inode *inode, loff_t 
> lstart, loff_t lend,
>       index = start;
>       for ( ; ; ) {
>               cond_resched();
> -             pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +
> +             pvec.nr = __find_get_pages(mapping, index,
>                               min(end - index, (pgoff_t)PAGEVEC_SIZE),
> -                                                     pvec.pages, indices);
> +                             pvec.pages, indices);
>               if (!pvec.nr) {
>                       if (index == start || unfalloc)
>                               break;
> @@ -544,7 +480,7 @@ static void shmem_undo_range(struct inode *inode, loff_t 
> lstart, loff_t lend,
>                       continue;
>               }
>               if ((index == start || unfalloc) && indices[0] >= end) {
> -                     shmem_deswap_pagevec(&pvec);
> +                     pagevec_remove_exceptionals(&pvec);
>                       pagevec_release(&pvec);
>                       break;
>               }
> @@ -573,7 +509,7 @@ static void shmem_undo_range(struct inode *inode, loff_t 
> lstart, loff_t lend,
>                       }
>                       unlock_page(page);
>               }
> -             shmem_deswap_pagevec(&pvec);
> +             pagevec_remove_exceptionals(&pvec);
>               pagevec_release(&pvec);
>               mem_cgroup_uncharge_end();
>               index++;
> @@ -1081,7 +1017,7 @@ static int shmem_getpage_gfp(struct inode *inode, 
> pgoff_t index,
>               return -EFBIG;
>  repeat:
>       swap.val = 0;
> -     page = find_lock_page(mapping, index);
> +     page = __find_lock_page(mapping, index);
>       if (radix_tree_exceptional_entry(page)) {
>               swap = radix_to_swp_entry(page);
>               page = NULL;
> @@ -1418,6 +1354,11 @@ static struct inode *shmem_get_inode(struct 
> super_block *sb, const struct inode
>       return inode;
>  }
>  
> +bool shmem_mapping(struct address_space *mapping)
> +{
> +     return mapping->backing_dev_info == &shmem_backing_dev_info;
> +}
> +
>  #ifdef CONFIG_TMPFS
>  static const struct inode_operations shmem_symlink_inode_operations;
>  static const struct inode_operations shmem_short_symlink_operations;
> @@ -1730,7 +1671,7 @@ static pgoff_t shmem_seek_hole_data(struct 
> address_space *mapping,
>       pagevec_init(&pvec, 0);
>       pvec.nr = 1;            /* start small: we may be there already */
>       while (!done) {
> -             pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
> +             pvec.nr = __find_get_pages(mapping, index,
>                                       pvec.nr, pvec.pages, indices);
>               if (!pvec.nr) {
>                       if (whence == SEEK_DATA)
> @@ -1757,7 +1698,7 @@ static pgoff_t shmem_seek_hole_data(struct 
> address_space *mapping,
>                               break;
>                       }
>               }
> -             shmem_deswap_pagevec(&pvec);
> +             pagevec_remove_exceptionals(&pvec);
>               pagevec_release(&pvec);
>               pvec.nr = PAGEVEC_SIZE;
>               cond_resched();
> diff --git a/mm/swap.c b/mm/swap.c
> index 759c3caf44bd..f624e5b4b724 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -894,6 +894,53 @@ EXPORT_SYMBOL(__pagevec_lru_add);
>  
>  /**
>   * pagevec_lookup - gang pagecache lookup

      __pagevec_lookup?

> + * @pvec:    Where the resulting entries are placed
> + * @mapping: The address_space to search
> + * @start:   The starting entry index
> + * @nr_pages:        The maximum number of entries

      missing @indices?

> + *
> + * pagevec_lookup() will search for and return a group of up to
> + * @nr_pages pages and shadow entries in the mapping.  All entries are
> + * placed in @pvec.  pagevec_lookup() takes a reference against actual
> + * pages in @pvec.
> + *
> + * The search returns a group of mapping-contiguous entries with
> + * ascending indexes.  There may be holes in the indices due to
> + * not-present entries.
> + *
> + * pagevec_lookup() returns the number of entries which were found.

      __pagevec_lookup

> + */
> +unsigned __pagevec_lookup(struct pagevec *pvec, struct address_space 
> *mapping,
> +                       pgoff_t start, unsigned nr_pages, pgoff_t *indices)
> +{
> +     pvec->nr = __find_get_pages(mapping, start, nr_pages,
> +                                 pvec->pages, indices);
> +     return pagevec_count(pvec);
> +}
> +
> +/**
> + * pagevec_remove_exceptionals - pagevec exceptionals pruning
> + * @pvec:    The pagevec to prune
> + *
> + * __pagevec_lookup() fills both pages and exceptional radix tree
> + * entries into the pagevec.  This function prunes all exceptionals
> + * from @pvec without leaving holes, so that it can be passed on to
> + * other pagevec operations.
> + */
> +void pagevec_remove_exceptionals(struct pagevec *pvec)
> +{
> +     int i, j;
> +
> +     for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
> +             struct page *page = pvec->pages[i];
> +             if (!radix_tree_exceptional_entry(page))
> +                     pvec->pages[j++] = page;
> +     }
> +     pvec->nr = j;
> +}
> +
> +/**
> + * pagevec_lookup - gang pagecache lookup
>   * @pvec:    Where the resulting pages are placed
>   * @mapping: The address_space to search
>   * @start:   The starting page index
> diff --git a/mm/truncate.c b/mm/truncate.c
> index 353b683afd6e..b0f4d4bee8ab 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -22,6 +22,22 @@
>  #include <linux/cleancache.h>
>  #include "internal.h"
>  
> +static void clear_exceptional_entry(struct address_space *mapping,
> +                                 pgoff_t index, void *entry)
> +{
> +     /* Handled by shmem itself */
> +     if (shmem_mapping(mapping))
> +             return;
> +
> +     spin_lock_irq(&mapping->tree_lock);
> +     /*
> +      * Regular page slots are stabilized by the page lock even
> +      * without the tree itself locked.  These unlocked entries
> +      * need verification under the tree lock.
> +      */

Could you explain why repeated spin_lock with irq disabled isn't problem
in truncation path?

> +     radix_tree_delete_item(&mapping->page_tree, index, entry);
> +     spin_unlock_irq(&mapping->tree_lock);
> +}
>  
>  /**
>   * do_invalidatepage - invalidate part or all of a page
> @@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space 
> *mapping,
>       unsigned int    partial_start;  /* inclusive */
>       unsigned int    partial_end;    /* exclusive */
>       struct pagevec  pvec;
> +     pgoff_t         indices[PAGEVEC_SIZE];
>       pgoff_t         index;
>       int             i;
>  
> @@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space 
> *mapping,
>  
>       pagevec_init(&pvec, 0);
>       index = start;
> -     while (index < end && pagevec_lookup(&pvec, mapping, index,
> -                     min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> +     while (index < end && __pagevec_lookup(&pvec, mapping, index,
> +                     min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +                     indices)) {
>               mem_cgroup_uncharge_start();
>               for (i = 0; i < pagevec_count(&pvec); i++) {
>                       struct page *page = pvec.pages[i];
>  
>                       /* We rely upon deletion not changing page->index */
> -                     index = page->index;
> +                     index = indices[i];
>                       if (index >= end)
>                               break;
>  
> +                     if (radix_tree_exceptional_entry(page)) {
> +                             clear_exceptional_entry(mapping, index, page);
> +                             continue;
> +                     }
> +
>                       if (!trylock_page(page))
>                               continue;
>                       WARN_ON(page->index != index);
> @@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space 
> *mapping,
>                       truncate_inode_page(mapping, page);
>                       unlock_page(page);
>               }
> +             pagevec_remove_exceptionals(&pvec);
>               pagevec_release(&pvec);
>               mem_cgroup_uncharge_end();
>               cond_resched();
> @@ -307,14 +331,15 @@ void truncate_inode_pages_range(struct address_space 
> *mapping,
>       index = start;
>       for ( ; ; ) {
>               cond_resched();
> -             if (!pagevec_lookup(&pvec, mapping, index,
> -                     min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
> +             if (!__pagevec_lookup(&pvec, mapping, index,
> +                     min(end - index, (pgoff_t)PAGEVEC_SIZE),
> +                     indices)) {
>                       if (index == start)
>                               break;
>                       index = start;
>                       continue;
>               }
> -             if (index == start && pvec.pages[0]->index >= end) {
> +             if (index == start && indices[0] >= end) {
>                       pagevec_release(&pvec);
>                       break;
>               }
> @@ -323,16 +348,22 @@ void truncate_inode_pages_range(struct address_space 
> *mapping,
>                       struct page *page = pvec.pages[i];
>  
>                       /* We rely upon deletion not changing page->index */
> -                     index = page->index;
> +                     index = indices[i];
>                       if (index >= end)
>                               break;
>  
> +                     if (radix_tree_exceptional_entry(page)) {
> +                             clear_exceptional_entry(mapping, index, page);
> +                             continue;
> +                     }
> +
>                       lock_page(page);
>                       WARN_ON(page->index != index);
>                       wait_on_page_writeback(page);
>                       truncate_inode_page(mapping, page);
>                       unlock_page(page);
>               }
> +             pagevec_remove_exceptionals(&pvec);
>               pagevec_release(&pvec);
>               mem_cgroup_uncharge_end();
>               index++;
> @@ -375,6 +406,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
>  unsigned long invalidate_mapping_pages(struct address_space *mapping,
>               pgoff_t start, pgoff_t end)
>  {
> +     pgoff_t indices[PAGEVEC_SIZE];
>       struct pagevec pvec;
>       pgoff_t index = start;
>       unsigned long ret;
> @@ -390,17 +422,23 @@ unsigned long invalidate_mapping_pages(struct 
> address_space *mapping,
>        */
>  
>       pagevec_init(&pvec, 0);
> -     while (index <= end && pagevec_lookup(&pvec, mapping, index,
> -                     min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +     while (index <= end && __pagevec_lookup(&pvec, mapping, index,
> +                     min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
> +                     indices)) {
>               mem_cgroup_uncharge_start();
>               for (i = 0; i < pagevec_count(&pvec); i++) {
>                       struct page *page = pvec.pages[i];
>  
>                       /* We rely upon deletion not changing page->index */
> -                     index = page->index;
> +                     index = indices[i];
>                       if (index > end)
>                               break;
>  
> +                     if (radix_tree_exceptional_entry(page)) {
> +                             clear_exceptional_entry(mapping, index, page);
> +                             continue;
> +                     }
> +
>                       if (!trylock_page(page))
>                               continue;
>                       WARN_ON(page->index != index);
> @@ -414,6 +452,7 @@ unsigned long invalidate_mapping_pages(struct 
> address_space *mapping,
>                               deactivate_page(page);
>                       count += ret;
>               }
> +             pagevec_remove_exceptionals(&pvec);
>               pagevec_release(&pvec);
>               mem_cgroup_uncharge_end();
>               cond_resched();
> @@ -481,6 +520,7 @@ static int do_launder_page(struct address_space *mapping, 
> struct page *page)
>  int invalidate_inode_pages2_range(struct address_space *mapping,
>                                 pgoff_t start, pgoff_t end)
>  {
> +     pgoff_t indices[PAGEVEC_SIZE];
>       struct pagevec pvec;
>       pgoff_t index;
>       int i;
> @@ -491,17 +531,23 @@ int invalidate_inode_pages2_range(struct address_space 
> *mapping,
>       cleancache_invalidate_inode(mapping);
>       pagevec_init(&pvec, 0);
>       index = start;
> -     while (index <= end && pagevec_lookup(&pvec, mapping, index,
> -                     min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
> +     while (index <= end && __pagevec_lookup(&pvec, mapping, index,
> +                     min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
> +                     indices)) {
>               mem_cgroup_uncharge_start();
>               for (i = 0; i < pagevec_count(&pvec); i++) {
>                       struct page *page = pvec.pages[i];
>  
>                       /* We rely upon deletion not changing page->index */
> -                     index = page->index;
> +                     index = indices[i];
>                       if (index > end)
>                               break;
>  
> +                     if (radix_tree_exceptional_entry(page)) {
> +                             clear_exceptional_entry(mapping, index, page);
> +                             continue;
> +                     }
> +
>                       lock_page(page);
>                       WARN_ON(page->index != index);
>                       if (page->mapping != mapping) {
> @@ -539,6 +585,7 @@ int invalidate_inode_pages2_range(struct address_space 
> *mapping,
>                               ret = ret2;
>                       unlock_page(page);
>               }
> +             pagevec_remove_exceptionals(&pvec);
>               pagevec_release(&pvec);
>               mem_cgroup_uncharge_end();
>               cond_resched();
> -- 
> 1.8.4.2
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"d...@kvack.org";> em...@kvack.org </a>

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to