On Fri, Dec 08, 2017 at 01:41:10PM +0800, Huang, Ying wrote:
> Minchan Kim <minc...@kernel.org> writes:
> 
> > On Thu, Dec 07, 2017 at 04:29:37PM -0800, Andrew Morton wrote:
> >> On Thu,  7 Dec 2017 09:14:26 +0800 "Huang, Ying" <ying.hu...@intel.com> 
> >> wrote:
> >> 
> >> > When the swapin is performed, after getting the swap entry information
> >> > from the page table, the PTL (page table lock) will be released, then
> >> > system will go to swap in the swap entry, without any lock held to
> >> > prevent the swap device from being swapoff.  This may cause the race
> >> > like below,
> >> > 
> >> > CPU 1                            CPU 2
> >> > -----                            -----
> >> >                          do_swap_page
> >> >                            swapin_readahead
> >> >                              __read_swap_cache_async
> >> > swapoff                                swapcache_prepare
> >> >   p->swap_map = NULL                     __swap_duplicate
> >> >                                    p->swap_map[?] /* !!! NULL pointer 
> >> > access */
> >> > 
> >> > Because swapoff is usually done only at system shutdown, the race
> >> > may not hit many people in practice.  But it is still a race that
> >> > needs to be fixed.
> >> 
> >> swapoff is so rare that it's hard to get motivated about any fix which
> >> adds overhead to the regular codepaths.
> >
> > That was my concern, too when I see this patch.
> >
> >> 
> >> Is there something we can do to ensure that all the overhead of this
> >> fix is placed into the swapoff side?  stop_machine() may be a bit
> >> brutal, but a surprising amount of code uses it.  Any other ideas?
> >
> > How about this?
> >
> > I think it's the same approach as the old one, where we used si->lock
> > everywhere instead of the more fine-grained cluster lock.
> >
> > The reason I repeatedly reset p->max to zero in the loop is to avoid
> > needing a lockdep annotation (maybe spin_lock_nested(something)) to
> > prevent a false positive.
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 42fe5653814a..9ce007a42bbc 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -2644,6 +2644,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
> > specialfile)
> >     swap_file = p->swap_file;
> >     old_block_size = p->old_block_size;
> >     p->swap_file = NULL;
> > +
> > +   if (p->flags & SWP_SOLIDSTATE) {
> > +           unsigned long ci, nr_cluster;
> > +
> > +           nr_cluster = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
> > +           for (ci = 0; ci < nr_cluster; ci++) {
> > +                   struct swap_cluster_info *sci;
> > +
> > +                   sci = lock_cluster(p, ci * SWAPFILE_CLUSTER);
> > +                   p->max = 0;
> > +                   unlock_cluster(sci);
> > +           }
> > +   }
> >     p->max = 0;
> >     swap_map = p->swap_map;
> >     p->swap_map = NULL;
> > @@ -3369,10 +3382,10 @@ static int __swap_duplicate(swp_entry_t entry, 
> > unsigned char usage)
> >             goto bad_file;
> >     p = swap_info[type];
> >     offset = swp_offset(entry);
> > -   if (unlikely(offset >= p->max))
> > -           goto out;
> >  
> >     ci = lock_cluster_or_swap_info(p, offset);
> > +   if (unlikely(offset >= p->max))
> > +           goto unlock_out;
> >  
> >     count = p->swap_map[offset];
> >  
> 
> Sorry, this doesn't work, because
> 
> lock_cluster_or_swap_info()
> 
> needs to read p->cluster_info, which may be freed during swapoff too.
> 
> 
> To reduce the added overhead in the regular code path, maybe we can use
> SRCU to implement get_swap_device() and put_swap_device()?  There is only
> an increment/decrement of a CPU-local variable in srcu_read_lock/unlock().
> That should be acceptable in the not-so-hot swap path?
> 
> This needs to select CONFIG_SRCU if CONFIG_SWAP is enabled.  But I guess
> that should be acceptable too?
> 

Why do we need SRCU here? Is plain RCU, like below, enough?

It might have bugs, or room for improvement in performance/naming.
I just wanted to show my intention.

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2417d288e016..bfe493f3bcb8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -273,6 +273,7 @@ struct swap_info_struct {
                                         */
        struct work_struct discard_work; /* discard worker */
        struct swap_cluster_list discard_clusters; /* discard clusters list */
+       struct rcu_head rcu;
 };
 
 #ifdef CONFIG_64BIT
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 42fe5653814a..ecec064f9b20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -302,6 +302,7 @@ static inline struct swap_cluster_info 
*lock_cluster_or_swap_info(
 {
        struct swap_cluster_info *ci;
 
+       rcu_read_lock();
        ci = lock_cluster(si, offset);
        if (!ci)
                spin_lock(&si->lock);
@@ -316,6 +317,7 @@ static inline void unlock_cluster_or_swap_info(struct 
swap_info_struct *si,
                unlock_cluster(ci);
        else
                spin_unlock(&si->lock);
+       rcu_read_unlock();
 }
 
 static inline bool cluster_list_empty(struct swap_cluster_list *list)
@@ -2530,11 +2532,17 @@ bool has_usable_swap(void)
        return ret;
 }
 
+static void swap_cluster_info_free(struct rcu_head *rcu)
+{
+       struct swap_info_struct *si = container_of(rcu, struct 
swap_info_struct, rcu);
+       kvfree(si->cluster_info);
+       si->cluster_info = NULL;
+}
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
        struct swap_info_struct *p = NULL;
        unsigned char *swap_map;
-       struct swap_cluster_info *cluster_info;
        unsigned long *frontswap_map;
        struct file *swap_file, *victim;
        struct address_space *mapping;
@@ -2542,6 +2550,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        struct filename *pathname;
        int err, found = 0;
        unsigned int old_block_size;
+       unsigned long ci, nr_cluster;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -2631,6 +2640,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        spin_lock(&p->lock);
        drain_mmlist();
 
+       nr_cluster = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
        /* wait for anyone still in scan_swap_map */
        p->highest_bit = 0;             /* cuts scans short */
        while (p->flags >= SWP_SCANNING) {
@@ -2641,14 +2651,33 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
specialfile)
                spin_lock(&p->lock);
        }
 
+       if (p->flags & SWP_SOLIDSTATE) {
+               struct swap_cluster_info *sci, *head_cluster;
+
+               head_cluster = p->cluster_info;
+               spin_lock(&head_cluster->lock);
+               sci = head_cluster + 1;
+               for (ci = 1; ci < nr_cluster; ci++, sci++)
+                       spin_lock_nest_lock(&sci->lock, &head_cluster->lock);
+       }
+
        swap_file = p->swap_file;
        old_block_size = p->old_block_size;
        p->swap_file = NULL;
        p->max = 0;
        swap_map = p->swap_map;
        p->swap_map = NULL;
-       cluster_info = p->cluster_info;
-       p->cluster_info = NULL;
+
+       if (p->flags & SWP_SOLIDSTATE) {
+               struct swap_cluster_info *sci, *head_cluster;
+
+               head_cluster = p->cluster_info;
+               sci = head_cluster + 1;
+               for (ci = 1; ci < nr_cluster; ci++, sci++)
+                       spin_unlock(&sci->lock);
+               spin_unlock(&head_cluster->lock);
+       }
+
        frontswap_map = frontswap_map_get(p);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
@@ -2658,7 +2687,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
        vfree(swap_map);
-       kvfree(cluster_info);
+       call_rcu(&p->rcu, swap_cluster_info_free);
+       synchronize_rcu();
        kvfree(frontswap_map);
        /* Destroy swap account information */
        swap_cgroup_swapoff(p->type);
@@ -3369,10 +3399,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned 
char usage)
                goto bad_file;
        p = swap_info[type];
        offset = swp_offset(entry);
-       if (unlikely(offset >= p->max))
-               goto out;
 
        ci = lock_cluster_or_swap_info(p, offset);
+       if (unlikely(offset >= p->max))
+               goto unlock_out;
 
        count = p->swap_map[offset];
 


Reply via email to