On Mon 06-04-15 15:58:02, Tejun Heo wrote:
> Currently, a bdi (backing_dev_info) embeds a single wb (bdi_writeback)
> and the role of the separation is unclear.  For cgroup support for
> writeback IOs, a bdi will be updated to host multiple wb's where each
> wb serves writeback IOs of a different cgroup on the bdi.  To achieve
> that, a wb should carry all states necessary for servicing writeback
> IOs for a cgroup independently.
> 
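To make the direction concrete, here is a toy user-space model of where
the bandwidth state ends up (my sketch, not kernel code; the field names
mirror the patch, everything else is elided):

        #include <stdio.h>

        struct bdi_writeback {
                struct backing_dev_info *bdi;  /* backpointer, as in the patch */
                unsigned long write_bandwidth; /* per-wb after this patch */
                unsigned long dirty_ratelimit; /* per-wb after this patch */
                int dirty_exceeded;            /* per-wb after this patch */
        };

        struct backing_dev_info {
                struct bdi_writeback wb;           /* still exactly one, for now */
                unsigned int min_ratio, max_ratio; /* stays per-bdi */
        };

        int main(void)
        {
                /* 25600 pages/s == INIT_BW, i.e. 100 MB/s with 4k pages */
                struct backing_dev_info bdi = { .wb.write_bandwidth = 25600 };

                bdi.wb.bdi = &bdi;
                /* callers that read bdi->write_bandwidth now go through the wb */
                printf("wb write bandwidth: %lu pages/s\n",
                       bdi.wb.write_bandwidth);
                return 0;
        }

The backpointer matters because per-bdi knobs such as min_ratio/max_ratio
stay behind in the bdi and must remain reachable from a wb once a bdi
hosts more than one of them.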
> This patch moves bandwidth related fields from backing_dev_info into
> bdi_writeback.
> 
> * The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp,
>   write_bandwidth, avg_write_bandwidth, dirty_ratelimit,
>   balanced_dirty_ratelimit, completions and dirty_exceeded.
> 
> * writeback_chunk_size() and over_bgroup_thresh() now take @wb instead
>   of @bdi.
                                ^^^ over_bground_thresh()

Otherwise the patch looks good to me. You can add:
Reviewed-by: Jan Kara <j...@suse.cz>

                                                                Honza


> * bdi_writeout_fraction(bdi, ...)      -> wb_writeout_fraction(wb, ...)
>   bdi_dirty_limit(bdi, ...)            -> wb_dirty_limit(wb, ...)
>   bdi_position_ratio(bdi, ...)         -> wb_position_ratio(wb, ...)
>   bdi_update_write_bandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...)
>   [__]bdi_update_bandwidth(bdi, ...)   -> [__]wb_update_bandwidth(wb, ...)
>   bdi_{max|min}_pause(bdi, ...)        -> wb_{max|min}_pause(wb, ...)
>   bdi_dirty_limits(bdi, ...)           -> wb_dirty_limits(wb, ...)
> 
> * Init/exits of the relocated fields are moved to bdi_wb_init/exit()
>   respectively.  Note that explicit zeroing is dropped in the process
>   as wb's are cleared in entirety anyway.
> 
> * As there's still only one bdi_writeback per backing_dev_info, all
>   uses of the relocated fields are mechanically converted to their
>   bdi->wb.* equivalents, introducing no behavior changes.
> 
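A side note on the init/exit relocation above: with ->completions now in
the wb, bdi_wb_init() has to tear it down again if a later percpu counter
allocation fails. A condensed sketch of that ordering (my paraphrase of
the hunk below, written with the conventional reverse-order unwind;
kernel context assumed):

        err = fprop_local_init_percpu(&wb->completions, GFP_KERNEL);
        if (err)
                return err;

        for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
                err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL);
                if (err)
                        goto out_destroy_stat;
        }
        return 0;

out_destroy_stat:
        while (i--)     /* tear down the counters initialized so far */
                percpu_counter_destroy(&wb->stat[i]);
        fprop_local_destroy_percpu(&wb->completions);
        return err;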
> Signed-off-by: Tejun Heo <t...@kernel.org>
> Cc: Jens Axboe <ax...@kernel.dk>
> Cc: Jan Kara <j...@suse.cz>
> Cc: Wu Fengguang <fengguang...@intel.com>
> Cc: Jaegeuk Kim <jaeg...@kernel.org>
> Cc: Steven Whitehouse <swhit...@redhat.com>
> ---
>  fs/f2fs/node.c                   |   4 +-
>  fs/f2fs/segment.h                |   2 +-
>  fs/fs-writeback.c                |  17 ++-
>  fs/gfs2/super.c                  |   2 +-
>  include/linux/backing-dev.h      |  20 +--
>  include/linux/writeback.h        |  19 ++-
>  include/trace/events/writeback.h |   8 +-
>  mm/backing-dev.c                 |  45 +++----
>  mm/page-writeback.c              | 262 ++++++++++++++++++++-------------------
>  9 files changed, 187 insertions(+), 192 deletions(-)
> 
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 97bd9d3..a97da4e 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -51,7 +51,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
>                                                       PAGE_CACHE_SHIFT;
>               res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
>       } else if (type == DIRTY_DENTS) {
> -             if (sbi->sb->s_bdi->dirty_exceeded)
> +             if (sbi->sb->s_bdi->wb.dirty_exceeded)
>                       return false;
>               mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
>               res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
> @@ -63,7 +63,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
>                               sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
>               res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
>       } else {
> -             if (sbi->sb->s_bdi->dirty_exceeded)
> +             if (sbi->sb->s_bdi->wb.dirty_exceeded)
>                       return false;
>       }
>       return res;
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 7fd3511..3a5bfcf 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -712,7 +712,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
>   */
>  static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
>  {
> -     if (sbi->sb->s_bdi->dirty_exceeded)
> +     if (sbi->sb->s_bdi->wb.dirty_exceeded)
>               return 0;
>  
>       if (type == DATA)
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 992a065..4fcf2385 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -606,7 +606,7 @@ out:
>       return ret;
>  }
>  
> -static long writeback_chunk_size(struct backing_dev_info *bdi,
> +static long writeback_chunk_size(struct bdi_writeback *wb,
>                                struct wb_writeback_work *work)
>  {
>       long pages;
> @@ -627,7 +627,7 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
>       if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
>               pages = LONG_MAX;
>       else {
> -             pages = min(bdi->avg_write_bandwidth / 2,
> +             pages = min(wb->avg_write_bandwidth / 2,
>                           global_dirty_limit / DIRTY_SCOPE);
>               pages = min(pages, work->nr_pages);
>               pages = round_down(pages + MIN_WRITEBACK_PAGES,
> @@ -725,7 +725,7 @@ static long writeback_sb_inodes(struct super_block *sb,
>               inode->i_state |= I_SYNC;
>               spin_unlock(&inode->i_lock);
>  
> -             write_chunk = writeback_chunk_size(wb->bdi, work);
> +             write_chunk = writeback_chunk_size(wb, work);
>               wbc.nr_to_write = write_chunk;
>               wbc.pages_skipped = 0;
>  
> @@ -812,7 +812,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
>       return nr_pages - work.nr_pages;
>  }
>  
> -static bool over_bground_thresh(struct backing_dev_info *bdi)
> +static bool over_bground_thresh(struct bdi_writeback *wb)
>  {
>       unsigned long background_thresh, dirty_thresh;
>  
> @@ -822,8 +822,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi)
>           global_page_state(NR_UNSTABLE_NFS) > background_thresh)
>               return true;
>  
> -     if (wb_stat(&bdi->wb, WB_RECLAIMABLE) >
> -                             bdi_dirty_limit(bdi, background_thresh))
> +     if (wb_stat(wb, WB_RECLAIMABLE) > wb_dirty_limit(wb, background_thresh))
>               return true;
>  
>       return false;
> @@ -836,7 +835,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi)
>  static void wb_update_bandwidth(struct bdi_writeback *wb,
>                               unsigned long start_time)
>  {
> -     __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
> +     __wb_update_bandwidth(wb, 0, 0, 0, 0, 0, start_time);
>  }
>  
>  /*
> @@ -888,7 +887,7 @@ static long wb_writeback(struct bdi_writeback *wb,
>                * For background writeout, stop when we are below the
>                * background dirty threshold
>                */
> -             if (work->for_background && !over_bground_thresh(wb->bdi))
> +             if (work->for_background && !over_bground_thresh(wb))
>                       break;
>  
>               /*
> @@ -980,7 +979,7 @@ static unsigned long get_nr_dirty_pages(void)
>  
>  static long wb_check_background_flush(struct bdi_writeback *wb)
>  {
> -     if (over_bground_thresh(wb->bdi)) {
> +     if (over_bground_thresh(wb)) {
>  
>               struct wb_writeback_work work = {
>                       .nr_pages       = LONG_MAX,
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 1666382..5b4c4d35 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
>  
>       if (wbc->sync_mode == WB_SYNC_ALL)
>               gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
> -     if (bdi->dirty_exceeded)
> +     if (bdi->wb.dirty_exceeded)
>               gfs2_ail1_flush(sdp, wbc);
>       else
>               filemap_fdatawrite(metamapping);
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index fe7a907..2ab0604 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -60,16 +60,6 @@ struct bdi_writeback {
>       spinlock_t list_lock;           /* protects the b_* lists */
>  
>       struct percpu_counter stat[NR_WB_STAT_ITEMS];
> -};
> -
> -struct backing_dev_info {
> -     struct list_head bdi_list;
> -     unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
> -     unsigned int capabilities; /* Device capabilities */
> -     congested_fn *congested_fn; /* Function pointer if device is md/dm */
> -     void *congested_data;   /* Pointer to aux data for congested func */
> -
> -     char *name;
>  
>       unsigned long bw_time_stamp;    /* last time write bw is updated */
>       unsigned long dirtied_stamp;
> @@ -88,6 +78,16 @@ struct backing_dev_info {
>  
>       struct fprop_local_percpu completions;
>       int dirty_exceeded;
> +};
> +
> +struct backing_dev_info {
> +     struct list_head bdi_list;
> +     unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
> +     unsigned int capabilities; /* Device capabilities */
> +     congested_fn *congested_fn; /* Function pointer if device is md/dm */
> +     void *congested_data;   /* Pointer to aux data for congested func */
> +
> +     char *name;
>  
>       unsigned int min_ratio;
>       unsigned int max_ratio, max_prop_frac;
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 0004833..8e4485f 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -152,16 +152,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
>                                     void __user *, size_t *, loff_t *);
>  
>  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
> -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
> -                            unsigned long dirty);
> -
> -void __bdi_update_bandwidth(struct backing_dev_info *bdi,
> -                         unsigned long thresh,
> -                         unsigned long bg_thresh,
> -                         unsigned long dirty,
> -                         unsigned long bdi_thresh,
> -                         unsigned long bdi_dirty,
> -                         unsigned long start_time);
> +unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty);
> +
> +void __wb_update_bandwidth(struct bdi_writeback *wb,
> +                        unsigned long thresh,
> +                        unsigned long bg_thresh,
> +                        unsigned long dirty,
> +                        unsigned long bdi_thresh,
> +                        unsigned long bdi_dirty,
> +                        unsigned long start_time);
>  
>  void page_writeback_init(void);
>  void balance_dirty_pages_ratelimited(struct address_space *mapping);
> diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> index 5a14ead..5c9a68c 100644
> --- a/include/trace/events/writeback.h
> +++ b/include/trace/events/writeback.h
> @@ -383,13 +383,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
>  
>       TP_fast_assign(
>               strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
> -             __entry->write_bw       = KBps(bdi->write_bandwidth);
> -             __entry->avg_write_bw   = KBps(bdi->avg_write_bandwidth);
> +             __entry->write_bw       = KBps(bdi->wb.write_bandwidth);
> +             __entry->avg_write_bw   = KBps(bdi->wb.avg_write_bandwidth);
>               __entry->dirty_rate     = KBps(dirty_rate);
> -             __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
> +             __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
>               __entry->task_ratelimit = KBps(task_ratelimit);
>               __entry->balanced_dirty_ratelimit =
> -                                       KBps(bdi->balanced_dirty_ratelimit);
> +                                     KBps(bdi->wb.balanced_dirty_ratelimit);
>       ),
>  
>       TP_printk("bdi %s: "
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 7b1d191..9a6c472 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -66,7 +66,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>       spin_unlock(&wb->list_lock);
>  
>       global_dirty_limits(&background_thresh, &dirty_thresh);
> -     bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
> +     bdi_thresh = wb_dirty_limit(wb, dirty_thresh);
>  
>  #define K(x) ((x) << (PAGE_SHIFT - 10))
>       seq_printf(m,
> @@ -91,7 +91,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
>                  K(background_thresh),
>                  (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
>                  (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
> -                (unsigned long) K(bdi->write_bandwidth),
> +                (unsigned long) K(wb->write_bandwidth),
>                  nr_dirty,
>                  nr_io,
>                  nr_more_io,
> @@ -376,6 +376,11 @@ void bdi_unregister(struct backing_dev_info *bdi)
>  }
>  EXPORT_SYMBOL(bdi_unregister);
>  
> +/*
> + * Initial write bandwidth: 100 MB/s
> + */
> +#define INIT_BW              (100 << (20 - PAGE_SHIFT))
> +
>  static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
>  {
>       int i, err;
> @@ -391,11 +396,22 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
>       spin_lock_init(&wb->list_lock);
>       INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
>  
> +     wb->bw_time_stamp = jiffies;
> +     wb->balanced_dirty_ratelimit = INIT_BW;
> +     wb->dirty_ratelimit = INIT_BW;
> +     wb->write_bandwidth = INIT_BW;
> +     wb->avg_write_bandwidth = INIT_BW;
> +
> +     err = fprop_local_init_percpu(&wb->completions, GFP_KERNEL);
> +     if (err)
> +             return err;
> +
>       for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
>               err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL);
>               if (err) {
>                       while (--i)
>                               percpu_counter_destroy(&wb->stat[i]);
> +                     fprop_local_destroy_percpu(&wb->completions);
>                       return err;
>               }
>       }
> @@ -411,12 +427,9 @@ static void bdi_wb_exit(struct bdi_writeback *wb)
>  
>       for (i = 0; i < NR_WB_STAT_ITEMS; i++)
>               percpu_counter_destroy(&wb->stat[i]);
> -}
>  
> -/*
> - * Initial write bandwidth: 100 MB/s
> - */
> -#define INIT_BW              (100 << (20 - PAGE_SHIFT))
> +     fprop_local_destroy_percpu(&wb->completions);
> +}
>  
>  int bdi_init(struct backing_dev_info *bdi)
>  {
> @@ -435,22 +448,6 @@ int bdi_init(struct backing_dev_info *bdi)
>       if (err)
>               return err;
>  
> -     bdi->dirty_exceeded = 0;
> -
> -     bdi->bw_time_stamp = jiffies;
> -     bdi->written_stamp = 0;
> -
> -     bdi->balanced_dirty_ratelimit = INIT_BW;
> -     bdi->dirty_ratelimit = INIT_BW;
> -     bdi->write_bandwidth = INIT_BW;
> -     bdi->avg_write_bandwidth = INIT_BW;
> -
> -     err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
> -     if (err) {
> -             bdi_wb_exit(&bdi->wb);
> -             return err;
> -     }
> -
>       return 0;
>  }
>  EXPORT_SYMBOL(bdi_init);
> @@ -468,8 +465,6 @@ void bdi_destroy(struct backing_dev_info *bdi)
>       }
>  
>       bdi_wb_exit(&bdi->wb);
> -
> -     fprop_local_destroy_percpu(&bdi->completions);
>  }
>  EXPORT_SYMBOL(bdi_destroy);
>  
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index af3edb6..29fb4f3 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -399,7 +399,7 @@ static unsigned long wp_next_time(unsigned long cur_time)
>  static inline void __wb_writeout_inc(struct bdi_writeback *wb)
>  {
>       __inc_wb_stat(wb, WB_WRITTEN);
> -     __fprop_inc_percpu_max(&writeout_completions, &wb->bdi->completions,
> +     __fprop_inc_percpu_max(&writeout_completions, &wb->completions,
>                              wb->bdi->max_prop_frac);
>       /* First event after period switching was turned off? */
>       if (!unlikely(writeout_period_time)) {
> @@ -427,10 +427,10 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
>  /*
>   * Obtain an accurate fraction of the BDI's portion.
>   */
> -static void bdi_writeout_fraction(struct backing_dev_info *bdi,
> -             long *numerator, long *denominator)
> +static void wb_writeout_fraction(struct bdi_writeback *wb,
> +                              long *numerator, long *denominator)
>  {
> -     fprop_fraction_percpu(&writeout_completions, &bdi->completions,
> +     fprop_fraction_percpu(&writeout_completions, &wb->completions,
>                               numerator, denominator);
>  }
>  
> @@ -516,11 +516,11 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
>  }
>  
>  /**
> - * bdi_dirty_limit - @bdi's share of dirty throttling threshold
> - * @bdi: the backing_dev_info to query
> + * wb_dirty_limit - @wb's share of dirty throttling threshold
> + * @wb: bdi_writeback to query
>   * @dirty: global dirty limit in pages
>   *
> - * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
> + * Returns @wb's dirty limit in pages. The term "dirty" in the context of
>   * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
>   *
>   * Note that balance_dirty_pages() will only seriously take it as a hard limit
> @@ -528,34 +528,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
>   * control. For example, when the device is completely stalled due to some error
>   * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
>   * In the other normal situations, it acts more gently by throttling the tasks
> - * more (rather than completely block them) when the bdi dirty pages go high.
> + * more (rather than completely block them) when the wb dirty pages go high.
>   *
>   * It allocates high/low dirty limits to fast/slow devices, in order to prevent
>   * - starving fast devices
>   * - piling up dirty pages (that will take long time to sync) on slow devices
>   *
> - * The bdi's share of dirty limit will be adapting to its throughput and
> + * The wb's share of dirty limit will be adapting to its throughput and
>   * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
>   */
> -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
> +unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty)
>  {
> -     u64 bdi_dirty;
> +     struct backing_dev_info *bdi = wb->bdi;
> +     u64 wb_dirty;
>       long numerator, denominator;
>  
>       /*
>        * Calculate this BDI's share of the dirty ratio.
>        */
> -     bdi_writeout_fraction(bdi, &numerator, &denominator);
> +     wb_writeout_fraction(wb, &numerator, &denominator);
>  
> -     bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
> -     bdi_dirty *= numerator;
> -     do_div(bdi_dirty, denominator);
> +     wb_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
> +     wb_dirty *= numerator;
> +     do_div(wb_dirty, denominator);
>  
> -     bdi_dirty += (dirty * bdi->min_ratio) / 100;
> -     if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
> -             bdi_dirty = dirty * bdi->max_ratio / 100;
> +     wb_dirty += (dirty * bdi->min_ratio) / 100;
> +     if (wb_dirty > (dirty * bdi->max_ratio) / 100)
> +             wb_dirty = dirty * bdi->max_ratio / 100;
>  
> -     return bdi_dirty;
> +     return wb_dirty;
>  }
>  
>  /*
> @@ -664,14 +665,14 @@ static long long pos_ratio_polynom(unsigned long setpoint,
>   *   card's bdi_dirty may rush to many times higher than bdi_setpoint.
>   * - the bdi dirty thresh drops quickly due to change of JBOD workload
>   */
> -static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
> -                                     unsigned long thresh,
> -                                     unsigned long bg_thresh,
> -                                     unsigned long dirty,
> -                                     unsigned long bdi_thresh,
> -                                     unsigned long bdi_dirty)
> +static unsigned long wb_position_ratio(struct bdi_writeback *wb,
> +                                    unsigned long thresh,
> +                                    unsigned long bg_thresh,
> +                                    unsigned long dirty,
> +                                    unsigned long bdi_thresh,
> +                                    unsigned long bdi_dirty)
>  {
> -     unsigned long write_bw = bdi->avg_write_bandwidth;
> +     unsigned long write_bw = wb->avg_write_bandwidth;
>       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
>       unsigned long limit = hard_dirty_limit(thresh);
>       unsigned long x_intercept;
> @@ -702,12 +703,12 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
>        * consume arbitrary amount of RAM because it is accounted in
>        * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
>        *
> -      * Here, in bdi_position_ratio(), we calculate pos_ratio based on
> +      * Here, in wb_position_ratio(), we calculate pos_ratio based on
>        * two values: bdi_dirty and bdi_thresh. Let's consider an example:
>        * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
>        * limits are set by default to 10% and 20% (background and throttle).
>        * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
> -      * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
> +      * wb_dirty_limit(wb, bg_thresh) is about ~4K pages. bdi_setpoint is
>        * about ~6K pages (as the average of background and throttle bdi
>        * limits). The 3rd order polynomial will provide positive feedback if
>        * bdi_dirty is under bdi_setpoint and vice versa.
> @@ -717,7 +718,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
>        * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
>        * in the example above).
>        */
> -     if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
> +     if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
>               long long bdi_pos_ratio;
>               unsigned long bdi_bg_thresh;
>  
> @@ -842,13 +843,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
>       return pos_ratio;
>  }
>  
> -static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
> -                                    unsigned long elapsed,
> -                                    unsigned long written)
> +static void wb_update_write_bandwidth(struct bdi_writeback *wb,
> +                                   unsigned long elapsed,
> +                                   unsigned long written)
>  {
>       const unsigned long period = roundup_pow_of_two(3 * HZ);
> -     unsigned long avg = bdi->avg_write_bandwidth;
> -     unsigned long old = bdi->write_bandwidth;
> +     unsigned long avg = wb->avg_write_bandwidth;
> +     unsigned long old = wb->write_bandwidth;
>       u64 bw;
>  
>       /*
> @@ -861,14 +862,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
>        * @written may have decreased due to account_page_redirty().
>        * Avoid underflowing @bw calculation.
>        */
> -     bw = written - min(written, bdi->written_stamp);
> +     bw = written - min(written, wb->written_stamp);
>       bw *= HZ;
>       if (unlikely(elapsed > period)) {
>               do_div(bw, elapsed);
>               avg = bw;
>               goto out;
>       }
> -     bw += (u64)bdi->write_bandwidth * (period - elapsed);
> +     bw += (u64)wb->write_bandwidth * (period - elapsed);
>       bw >>= ilog2(period);
>  
>       /*
> @@ -881,8 +882,8 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
>               avg += (old - avg) >> 3;
>  
>  out:
> -     bdi->write_bandwidth = bw;
> -     bdi->avg_write_bandwidth = avg;
> +     wb->write_bandwidth = bw;
> +     wb->avg_write_bandwidth = avg;
>  }
>  
>  /*
> @@ -947,20 +948,20 @@ static void global_update_bandwidth(unsigned long thresh,
>   * Normal bdi tasks will be curbed at or below it in long term.
>   * Obviously it should be around (write_bw / N) when there are N dd tasks.
>   */
> -static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
> -                                    unsigned long thresh,
> -                                    unsigned long bg_thresh,
> -                                    unsigned long dirty,
> -                                    unsigned long bdi_thresh,
> -                                    unsigned long bdi_dirty,
> -                                    unsigned long dirtied,
> -                                    unsigned long elapsed)
> +static void wb_update_dirty_ratelimit(struct bdi_writeback *wb,
> +                                   unsigned long thresh,
> +                                   unsigned long bg_thresh,
> +                                   unsigned long dirty,
> +                                   unsigned long bdi_thresh,
> +                                   unsigned long bdi_dirty,
> +                                   unsigned long dirtied,
> +                                   unsigned long elapsed)
>  {
>       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
>       unsigned long limit = hard_dirty_limit(thresh);
>       unsigned long setpoint = (freerun + limit) / 2;
> -     unsigned long write_bw = bdi->avg_write_bandwidth;
> -     unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
> +     unsigned long write_bw = wb->avg_write_bandwidth;
> +     unsigned long dirty_ratelimit = wb->dirty_ratelimit;
>       unsigned long dirty_rate;
>       unsigned long task_ratelimit;
>       unsigned long balanced_dirty_ratelimit;
> @@ -972,10 +973,10 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
>        * The dirty rate will match the writeout rate in long term, except
>        * when dirty pages are truncated by userspace or re-dirtied by FS.
>        */
> -     dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
> +     dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
>  
> -     pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
> -                                    bdi_thresh, bdi_dirty);
> +     pos_ratio = wb_position_ratio(wb, thresh, bg_thresh, dirty,
> +                                   bdi_thresh, bdi_dirty);
>       /*
>        * task_ratelimit reflects each dd's dirty rate for the past 200ms.
>        */
> @@ -1059,31 +1060,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
>  
>       /*
>        * For strictlimit case, calculations above were based on bdi counters
> -      * and limits (starting from pos_ratio = bdi_position_ratio() and up to
> +      * and limits (starting from pos_ratio = wb_position_ratio() and up to
>        * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
>        * Hence, to calculate "step" properly, we have to use bdi_dirty as
>        * "dirty" and bdi_setpoint as "setpoint".
>        *
>        * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
>        * it's possible that bdi_thresh is close to zero due to inactivity
> -      * of backing device (see the implementation of bdi_dirty_limit()).
> +      * of backing device (see the implementation of wb_dirty_limit()).
>        */
> -     if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
> +     if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
>               dirty = bdi_dirty;
>               if (bdi_dirty < 8)
>                       setpoint = bdi_dirty + 1;
>               else
>                       setpoint = (bdi_thresh +
> -                                 bdi_dirty_limit(bdi, bg_thresh)) / 2;
> +                                 wb_dirty_limit(wb, bg_thresh)) / 2;
>       }
>  
>       if (dirty < setpoint) {
> -             x = min3(bdi->balanced_dirty_ratelimit,
> +             x = min3(wb->balanced_dirty_ratelimit,
>                        balanced_dirty_ratelimit, task_ratelimit);
>               if (dirty_ratelimit < x)
>                       step = x - dirty_ratelimit;
>       } else {
> -             x = max3(bdi->balanced_dirty_ratelimit,
> +             x = max3(wb->balanced_dirty_ratelimit,
>                        balanced_dirty_ratelimit, task_ratelimit);
>               if (dirty_ratelimit > x)
>                       step = dirty_ratelimit - x;
> @@ -1105,22 +1106,22 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
>       else
>               dirty_ratelimit -= step;
>  
> -     bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
> -     bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
> +     wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
> +     wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
>  
> -     trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
> +     trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
>  }
>  
> -void __bdi_update_bandwidth(struct backing_dev_info *bdi,
> -                         unsigned long thresh,
> -                         unsigned long bg_thresh,
> -                         unsigned long dirty,
> -                         unsigned long bdi_thresh,
> -                         unsigned long bdi_dirty,
> -                         unsigned long start_time)
> +void __wb_update_bandwidth(struct bdi_writeback *wb,
> +                        unsigned long thresh,
> +                        unsigned long bg_thresh,
> +                        unsigned long dirty,
> +                        unsigned long bdi_thresh,
> +                        unsigned long bdi_dirty,
> +                        unsigned long start_time)
>  {
>       unsigned long now = jiffies;
> -     unsigned long elapsed = now - bdi->bw_time_stamp;
> +     unsigned long elapsed = now - wb->bw_time_stamp;
>       unsigned long dirtied;
>       unsigned long written;
>  
> @@ -1130,44 +1131,44 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
>       if (elapsed < BANDWIDTH_INTERVAL)
>               return;
>  
> -     dirtied = percpu_counter_read(&bdi->wb.stat[WB_DIRTIED]);
> -     written = percpu_counter_read(&bdi->wb.stat[WB_WRITTEN]);
> +     dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
> +     written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
>  
>       /*
>        * Skip quiet periods when disk bandwidth is under-utilized.
>        * (at least 1s idle time between two flusher runs)
>        */
> -     if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
> +     if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
>               goto snapshot;
>  
>       if (thresh) {
>               global_update_bandwidth(thresh, dirty, now);
> -             bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
> -                                        bdi_thresh, bdi_dirty,
> -                                        dirtied, elapsed);
> +             wb_update_dirty_ratelimit(wb, thresh, bg_thresh, dirty,
> +                                       bdi_thresh, bdi_dirty,
> +                                       dirtied, elapsed);
>       }
> -     bdi_update_write_bandwidth(bdi, elapsed, written);
> +     wb_update_write_bandwidth(wb, elapsed, written);
>  
>  snapshot:
> -     bdi->dirtied_stamp = dirtied;
> -     bdi->written_stamp = written;
> -     bdi->bw_time_stamp = now;
> +     wb->dirtied_stamp = dirtied;
> +     wb->written_stamp = written;
> +     wb->bw_time_stamp = now;
>  }
>  
> -static void bdi_update_bandwidth(struct backing_dev_info *bdi,
> -                              unsigned long thresh,
> -                              unsigned long bg_thresh,
> -                              unsigned long dirty,
> -                              unsigned long bdi_thresh,
> -                              unsigned long bdi_dirty,
> -                              unsigned long start_time)
> +static void wb_update_bandwidth(struct bdi_writeback *wb,
> +                             unsigned long thresh,
> +                             unsigned long bg_thresh,
> +                             unsigned long dirty,
> +                             unsigned long bdi_thresh,
> +                             unsigned long bdi_dirty,
> +                             unsigned long start_time)
>  {
> -     if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
> +     if (time_is_after_eq_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
>               return;
> -     spin_lock(&bdi->wb.list_lock);
> -     __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
> -                            bdi_thresh, bdi_dirty, start_time);
> -     spin_unlock(&bdi->wb.list_lock);
> +     spin_lock(&wb->list_lock);
> +     __wb_update_bandwidth(wb, thresh, bg_thresh, dirty,
> +                           bdi_thresh, bdi_dirty, start_time);
> +     spin_unlock(&wb->list_lock);
>  }
>  
>  /*
> @@ -1187,10 +1188,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
>       return 1;
>  }
>  
> -static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
> -                                unsigned long bdi_dirty)
> +static unsigned long wb_max_pause(struct bdi_writeback *wb,
> +                                   unsigned long bdi_dirty)
>  {
> -     unsigned long bw = bdi->avg_write_bandwidth;
> +     unsigned long bw = wb->avg_write_bandwidth;
>       unsigned long t;
>  
>       /*
> @@ -1206,14 +1207,14 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
>       return min_t(unsigned long, t, MAX_PAUSE);
>  }
>  
> -static long bdi_min_pause(struct backing_dev_info *bdi,
> -                       long max_pause,
> -                       unsigned long task_ratelimit,
> -                       unsigned long dirty_ratelimit,
> -                       int *nr_dirtied_pause)
> +static long wb_min_pause(struct bdi_writeback *wb,
> +                      long max_pause,
> +                      unsigned long task_ratelimit,
> +                      unsigned long dirty_ratelimit,
> +                      int *nr_dirtied_pause)
>  {
> -     long hi = ilog2(bdi->avg_write_bandwidth);
> -     long lo = ilog2(bdi->dirty_ratelimit);
> +     long hi = ilog2(wb->avg_write_bandwidth);
> +     long lo = ilog2(wb->dirty_ratelimit);
>       long t;         /* target pause */
>       long pause;     /* estimated next pause */
>       int pages;      /* target nr_dirtied_pause */
> @@ -1281,14 +1282,13 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
>       return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
>  }
>  
> -static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
> -                                 unsigned long dirty_thresh,
> -                                 unsigned long background_thresh,
> -                                 unsigned long *bdi_dirty,
> -                                 unsigned long *bdi_thresh,
> -                                 unsigned long *bdi_bg_thresh)
> +static inline void wb_dirty_limits(struct bdi_writeback *wb,
> +                                unsigned long dirty_thresh,
> +                                unsigned long background_thresh,
> +                                unsigned long *bdi_dirty,
> +                                unsigned long *bdi_thresh,
> +                                unsigned long *bdi_bg_thresh)
>  {
> -     struct bdi_writeback *wb = &bdi->wb;
>       unsigned long wb_reclaimable;
>  
>       /*
> @@ -1301,10 +1301,10 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
>        *   In this case we don't want to hard throttle the USB key
>        *   dirtiers for 100 seconds until bdi_dirty drops under
>        *   bdi_thresh. Instead the auxiliary bdi control line in
> -      *   bdi_position_ratio() will let the dirtier task progress
> +      *   wb_position_ratio() will let the dirtier task progress
>        *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
>        */
> -     *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
> +     *bdi_thresh = wb_dirty_limit(wb, dirty_thresh);
>  
>       if (bdi_bg_thresh)
>               *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
> @@ -1354,6 +1354,7 @@ static void balance_dirty_pages(struct address_space *mapping,
>       unsigned long dirty_ratelimit;
>       unsigned long pos_ratio;
>       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
> +     struct bdi_writeback *wb = &bdi->wb;
>       bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
>       unsigned long start_time = jiffies;
>  
> @@ -1378,8 +1379,8 @@ static void balance_dirty_pages(struct address_space *mapping,
>               global_dirty_limits(&background_thresh, &dirty_thresh);
>  
>               if (unlikely(strictlimit)) {
> -                     bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
> -                                      &bdi_dirty, &bdi_thresh, &bg_thresh);
> +                     wb_dirty_limits(wb, dirty_thresh, background_thresh,
> +                                     &bdi_dirty, &bdi_thresh, &bg_thresh);
>  
>                       dirty = bdi_dirty;
>                       thresh = bdi_thresh;
> @@ -1410,28 +1411,28 @@ static void balance_dirty_pages(struct address_space *mapping,
>                       bdi_start_background_writeback(bdi);
>  
>               if (!strictlimit)
> -                     bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
> -                                      &bdi_dirty, &bdi_thresh, NULL);
> +                     wb_dirty_limits(wb, dirty_thresh, background_thresh,
> +                                     &bdi_dirty, &bdi_thresh, NULL);
>  
>               dirty_exceeded = (bdi_dirty > bdi_thresh) &&
>                                ((nr_dirty > dirty_thresh) || strictlimit);
> -             if (dirty_exceeded && !bdi->dirty_exceeded)
> -                     bdi->dirty_exceeded = 1;
> +             if (dirty_exceeded && !wb->dirty_exceeded)
> +                     wb->dirty_exceeded = 1;
>  
> -             bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
> -                                  nr_dirty, bdi_thresh, bdi_dirty,
> -                                  start_time);
> +             wb_update_bandwidth(wb, dirty_thresh, background_thresh,
> +                                 nr_dirty, bdi_thresh, bdi_dirty,
> +                                 start_time);
>  
> -             dirty_ratelimit = bdi->dirty_ratelimit;
> -             pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
> -                                            background_thresh, nr_dirty,
> -                                            bdi_thresh, bdi_dirty);
> +             dirty_ratelimit = wb->dirty_ratelimit;
> +             pos_ratio = wb_position_ratio(wb, dirty_thresh,
> +                                           background_thresh, nr_dirty,
> +                                           bdi_thresh, bdi_dirty);
>               task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
>                                                       RATELIMIT_CALC_SHIFT;
> -             max_pause = bdi_max_pause(bdi, bdi_dirty);
> -             min_pause = bdi_min_pause(bdi, max_pause,
> -                                       task_ratelimit, dirty_ratelimit,
> -                                       &nr_dirtied_pause);
> +             max_pause = wb_max_pause(wb, bdi_dirty);
> +             min_pause = wb_min_pause(wb, max_pause,
> +                                      task_ratelimit, dirty_ratelimit,
> +                                      &nr_dirtied_pause);
>  
>               if (unlikely(task_ratelimit == 0)) {
>                       period = max_pause;
> @@ -1515,15 +1516,15 @@ pause:
>                * more page. However bdi_dirty has accounting errors.  So use
>                * the larger and more IO friendly wb_stat_error.
>                */
> -             if (bdi_dirty <= wb_stat_error(&bdi->wb))
> +             if (bdi_dirty <= wb_stat_error(wb))
>                       break;
>  
>               if (fatal_signal_pending(current))
>                       break;
>       }
>  
> -     if (!dirty_exceeded && bdi->dirty_exceeded)
> -             bdi->dirty_exceeded = 0;
> +     if (!dirty_exceeded && wb->dirty_exceeded)
> +             wb->dirty_exceeded = 0;
>  
>       if (writeback_in_progress(bdi))
>               return;
> @@ -1577,6 +1578,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
>  void balance_dirty_pages_ratelimited(struct address_space *mapping)
>  {
>       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
> +     struct bdi_writeback *wb = &bdi->wb;
>       int ratelimit;
>       int *p;
>  
> @@ -1584,7 +1586,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
>               return;
>  
>       ratelimit = current->nr_dirtied_pause;
> -     if (bdi->dirty_exceeded)
> +     if (wb->dirty_exceeded)
>               ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
>  
>       preempt_disable();
> -- 
> 2.1.0
> 
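For readers who want to sanity-check the wb_dirty_limit() docstring in
the patch above: the arithmetic reduces to "global limit, scaled by this
wb's recent writeout fraction, clamped by the bdi's min/max ratios". A
stand-alone sketch (my simplification: plain division instead of
do_div(), the global bdi_min_ratio taken as 0; in the real code the
numerator/denominator pair comes from fprop_fraction_percpu()):

        #include <stdio.h>

        static unsigned long wb_dirty_limit_sketch(unsigned long dirty,
                                                   long numerator, long denominator,
                                                   unsigned int min_ratio,
                                                   unsigned int max_ratio)
        {
                unsigned long wb_dirty = dirty * (100 - 0) / 100;

                wb_dirty = wb_dirty * numerator / denominator; /* writeout share */
                wb_dirty += dirty * min_ratio / 100;           /* per-bdi floor */
                if (wb_dirty > dirty * max_ratio / 100)        /* per-bdi ceiling */
                        wb_dirty = dirty * max_ratio / 100;
                return wb_dirty;
        }

        int main(void)
        {
                /* global limit of 1M pages; this wb did 1/4 of recent
                 * writeout; bdi capped at max_ratio=20% -> 209715 pages */
                printf("%lu\n", wb_dirty_limit_sketch(1048576, 1, 4, 0, 20));
                return 0;
        }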
-- 
Jan Kara <j...@suse.cz>
SUSE Labs, CR