On Fri, 24 Apr 2015 21:39:04 +0800 Yuanhan Liu <yuanhan....@linux.intel.com>
wrote:

> I noticed heavy spin lock contention at get_active_stripe() with fsmark
> multiple thread write workloads.
> 
> Here is how this hot contention comes from. We have limited stripes, and
> it's a multiple thread write workload. Hence, those stripes will be taken
> soon, which puts later processes to sleep for waiting free stripes. When
> enough stripes(> 1/4 total stripes) are released, all process are woken,
> trying to get the lock. But there is one only being able to get this lock
> for each hash lock, making other processes spinning out there for acquiring
> the lock.
> 
> Thus, it's effectiveless to wakeup all processes and let them battle for
> a lock that permits one to access only each time. Instead, we could make
> it be a exclusive wake up: wake up one process only. That avoids the heavy
> spin lock contention naturally.
> 
> Here are some test results I have got with this patch applied(all test run
> 3 times):
> 
> `fsmark.files_per_sec'
> =====================
> 
> next-20150317                 this patch
> -------------------------     -------------------------
> metric_value     ±stddev      metric_value     ±stddev     change      
> testbox/benchmark/testcase-params
> -------------------------     -------------------------   --------     
> ------------------------------
>       25.600     ±0.0              92.700     ±2.5          262.1%     
> ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
>       25.600     ±0.0              77.800     ±0.6          203.9%     
> ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
>       32.000     ±0.0              93.800     ±1.7          193.1%     
> ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
>       32.000     ±0.0              81.233     ±1.7          153.9%     
> ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
>       48.800     ±14.5             99.667     ±2.0          104.2%     
> ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
>        6.400     ±0.0              12.800     ±0.0          100.0%     
> ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
>       63.133     ±8.2              82.800     ±0.7           31.2%     
> ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
>      245.067     ±0.7             306.567     ±7.9           25.1%     
> ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose
>       17.533     ±0.3              21.000     ±0.8           19.8%     
> ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
>      188.167     ±1.9             215.033     ±3.1           14.3%     
> ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync
>      254.500     ±1.8             290.733     ±2.4           14.2%     
> ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync
> 
> `time.system_time'
> =====================
> 
> next-20150317                 this patch
> -------------------------    -------------------------
> metric_value     ±stddev     metric_value     ±stddev     change       
> testbox/benchmark/testcase-params
> -------------------------    -------------------------    --------     
> ------------------------------
>     7235.603     ±1.2             185.163     ±1.9          -97.4%     
> ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
>     7666.883     ±2.9             202.750     ±1.0          -97.4%     
> ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
>    14567.893     ±0.7             421.230     ±0.4          -97.1%     
> ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
>     3697.667     ±14.0            148.190     ±1.7          -96.0%     
> ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
>     5572.867     ±3.8             310.717     ±1.4          -94.4%     
> ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
>     5565.050     ±0.5             313.277     ±1.5          -94.4%     
> ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
>     2420.707     ±17.1            171.043     ±2.7          -92.9%     
> ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
>     3743.300     ±4.6             379.827     ±3.5          -89.9%     
> ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose
>     3308.687     ±6.3             363.050     ±2.0          -89.0%     
> ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
> 
> Where,
> 
>      1x: where 'x' means iterations or loop, corresponding to the 'L' option 
> of fsmark
> 
>      1t, 64t: where 't' means thread
> 
>      4M: means the single file size, corresponding to the '-s' option of 
> fsmark
>      40G, 30G, 120G: means the total test size
> 
>      4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' 
> means
>                the size of one ramdisk. So, it would be 48G in total. And we 
> made a
>                raid on those ramdisk
> 
> As you can see, though there are no much performance gain for hard disk
> workload, the system time is dropped heavily, up to 97%. And as expected,
> the performance increased a lot, up to 260%, for fast device(ram disk).
> 
> Signed-off-by: Yuanhan Liu <yuanhan....@linux.intel.com>
> ---
>  drivers/md/raid5.c | 46 +++++++++++++++++++++++++++++++++++-----------
>  drivers/md/raid5.h |  2 +-
>  2 files changed, 36 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index b7e385f..2d8fcc1 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf 
> *conf,
>                                        int hash)
>  {
>       int size;
> -     bool do_wakeup = false;
> +     bool do_wakeup[NR_STRIPE_HASH_LOCKS] = { false, };

I think I'd rather use an 'unsigned long' and set bits.

> +     int i = 0;
>       unsigned long flags;
>  
>       if (hash == NR_STRIPE_HASH_LOCKS) {
> @@ -365,17 +366,22 @@ static void release_inactive_stripe_list(struct r5conf 
> *conf,
>                           !list_empty(list))
>                               atomic_dec(&conf->empty_inactive_list_nr);
>                       list_splice_tail_init(list, conf->inactive_list + hash);
> -                     do_wakeup = true;
> +                     do_wakeup[size - 1] = true;

... so this becomes
        do_wakeup |= 1 << (size - 1);

>                       spin_unlock_irqrestore(conf->hash_locks + hash, flags);
>               }
>               size--;
>               hash--;
>       }
>  
> -     if (do_wakeup) {
> -             wake_up(&conf->wait_for_stripe);
> -             if (conf->retry_read_aligned)
> -                     md_wakeup_thread(conf->mddev->thread);
> +     for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
> +             bool waked_thread = false;
> +             if (do_wakeup[i]) {
> +                     wake_up(&conf->wait_for_stripe[i]);
> +                     if (!waked_thread) {
> +                             waked_thread = true;
> +                             md_wakeup_thread(conf->mddev->thread);
> +                     }
> +             }

I don't think you want waked_thread to be local to this loop.
As it is, the "if (!waked_thread)" test *always* succeeds.

You can discard it if do_wakeup becomes and unsigned long, and just do

 if (do_wakeup && conf->retry_read_aligned)
        md_wakeup_thread(conf->mddev->thread);

And why have you removed the test on conf->retry_read_aligned??

>       }
>  }
>  
> @@ -655,6 +661,18 @@ static int has_failed(struct r5conf *conf)
>       return 0;
>  }
>  
> +/* XXX: might put it to linux/wait.h to be a public API? */

Yes, definitely put it in linux/wait.h


Thanks,
NeilBrown




> +#define raid_wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)     \
> +do {                                                                 \
> +     if (condition)                                                  \
> +             break;                                                  \
> +     (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0,  \
> +                         cmd1;                                       \
> +                         schedule();                                 \
> +                         cmd2);                                      \
> +} while (0)
> +
> +
>  static struct stripe_head *
>  get_active_stripe(struct r5conf *conf, sector_t sector,
>                 int previous, int noblock, int noquiesce)
> @@ -684,14 +702,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
>                       if (!sh) {
>                               set_bit(R5_INACTIVE_BLOCKED,
>                                       &conf->cache_state);
> -                             wait_event_lock_irq(
> -                                     conf->wait_for_stripe,
> +                             raid_wait_event_exclusive_cmd(
> +                                     conf->wait_for_stripe[hash],
>                                       !list_empty(conf->inactive_list + hash) 
> &&
>                                       (atomic_read(&conf->active_stripes)
>                                        < (conf->max_nr_stripes * 3 / 4)
>                                        || !test_bit(R5_INACTIVE_BLOCKED,
>                                                     &conf->cache_state)),
> -                                     *(conf->hash_locks + hash));
> +                                     spin_unlock_irq(conf->hash_locks + 
> hash),
> +                                     spin_lock_irq(conf->hash_locks + hash));
>                               clear_bit(R5_INACTIVE_BLOCKED,
>                                         &conf->cache_state);
>                       } else {
> @@ -716,6 +735,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
>               }
>       } while (sh == NULL);
>  
> +     if (!list_empty(conf->inactive_list + hash))
> +             wake_up(&conf->wait_for_stripe[hash]);
> +
>       spin_unlock_irq(conf->hash_locks + hash);
>       return sh;
>  }
> @@ -2136,7 +2158,7 @@ static int resize_stripes(struct r5conf *conf, int 
> newsize)
>       cnt = 0;
>       list_for_each_entry(nsh, &newstripes, lru) {
>               lock_device_hash_lock(conf, hash);
> -             wait_event_cmd(conf->wait_for_stripe,
> +             raid_wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
>                                   !list_empty(conf->inactive_list + hash),
>                                   unlock_device_hash_lock(conf, hash),
>                                   lock_device_hash_lock(conf, hash));
> @@ -6391,7 +6413,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
>       spin_lock_init(&conf->device_lock);
>       seqcount_init(&conf->gen_lock);
>       init_waitqueue_head(&conf->wait_for_quiesce);
> -     init_waitqueue_head(&conf->wait_for_stripe);
> +     for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
> +             init_waitqueue_head(&conf->wait_for_stripe[i]);
> +     }
>       init_waitqueue_head(&conf->wait_for_overlap);
>       INIT_LIST_HEAD(&conf->handle_list);
>       INIT_LIST_HEAD(&conf->hold_list);
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index fab53a3..cdad2d2 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -509,7 +509,7 @@ struct r5conf {
>       atomic_t                empty_inactive_list_nr;
>       struct llist_head       released_stripes;
>       wait_queue_head_t       wait_for_quiesce;
> -     wait_queue_head_t       wait_for_stripe;
> +     wait_queue_head_t       wait_for_stripe[NR_STRIPE_HASH_LOCKS];
>       wait_queue_head_t       wait_for_overlap;
>       unsigned long           cache_state;
>  #define R5_INACTIVE_BLOCKED  1       /* release of inactive stripes blocked,

Attachment: pgpNuLbGMscWl.pgp
Description: OpenPGP digital signature

Reply via email to