On 02/04/2016 02:08 PM, Javier González wrote:
> Flash controllers typically define flash pages as a collection of flash
> sectors of typically 4K. Moreover, flash controllers might program flash
> pages across several planes. This defines the write granularity at which
> flash can be programmed. This is different for each flash controller. In
> rrpc, the assumption has been that flash pages are 4K, thus writes could
> be sent out to the media as they were received.
> 
> This patch adds a per-block write buffer to rrpc. Writes are flushed to
> the media in chunks of the minimum granularity allowed by the
> controller, though the strategy can be changed to, for example, only
> flush at the maximum supported (64 pages in nvme). Apart from enabling
> the use of rrpc in real hardware, this buffering strategy will be used
> for recovery; if a block becomes bad, a new block can be allocated to
> persist valid data.

Great work. The code is nearly 2x'ed in size. Can you add a detailed
explanation on the inner workings of the write buffering to help new
users to understand its logic?

> 
> Signed-off-by: Javier González <jav...@cnexlabs.com>
> ---
>  drivers/lightnvm/rrpc.c  | 851 
> ++++++++++++++++++++++++++++++++++++++---------
>  drivers/lightnvm/rrpc.h  |  82 ++++-
>  include/linux/lightnvm.h |   6 +-
>  3 files changed, 771 insertions(+), 168 deletions(-)
> 
> diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
> index 8187bf3..e9fb19d 100644
> --- a/drivers/lightnvm/rrpc.c
> +++ b/drivers/lightnvm/rrpc.c
> @@ -16,11 +16,12 @@
>  
>  #include "rrpc.h"
>  
> -static struct kmem_cache *rrpc_gcb_cache, *rrpc_rq_cache;
> +static struct kmem_cache *rrpc_gcb_cache, *rrpc_rq_cache, *rrpc_rrq_cache,
> +                     *rrpc_buf_rrq_cache, *rrpc_wb_cache, *rrpc_block_cache;
>  static DECLARE_RWSEM(rrpc_lock);
>  
>  static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
> -                             struct nvm_rq *rqd, unsigned long flags);
> +                             struct rrpc_rq *rqd, unsigned long flags);
>  
>  #define rrpc_for_each_lun(rrpc, rlun, i) \
>               for ((i) = 0, rlun = &(rrpc)->luns[0]; \
> @@ -62,53 +63,59 @@ static void rrpc_invalidate_range(struct rrpc *rrpc, 
> sector_t slba,
>       spin_unlock(&rrpc->rev_lock);
>  }
>  
> -static struct nvm_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc,
> +static void rrpc_release_and_free_rrqd(struct kref *ref)
> +{
> +     struct rrpc_rq *rrqd = container_of(ref, struct rrpc_rq, refs);
> +     struct rrpc *rrpc = rrqd->addr->rblk->rlun->rrpc;

Wow.. Happy that we got to rrpc :)

> +
> +     rrpc_unlock_rq(rrpc, rrqd);
> +     mempool_free(rrqd, rrpc->rrq_pool);
> +}
> +
> +static struct rrpc_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc,
>                                       sector_t laddr, unsigned int pages)
>  {
> -     struct nvm_rq *rqd;
> +     struct rrpc_rq *rrqd;
>       struct rrpc_inflight_rq *inf;
>  
> -     rqd = mempool_alloc(rrpc->rq_pool, GFP_ATOMIC);
> -     if (!rqd)
> +     rrqd = mempool_alloc(rrpc->rrq_pool, GFP_ATOMIC);
> +     if (!rrqd)
>               return ERR_PTR(-ENOMEM);
> +     kref_init(&rrqd->refs);
>  
> -     inf = rrpc_get_inflight_rq(rqd);
> +     inf = rrpc_get_inflight_rq(rrqd);
>       if (rrpc_lock_laddr(rrpc, laddr, pages, inf)) {
> -             mempool_free(rqd, rrpc->rq_pool);
> +             mempool_free(rrqd, rrpc->rrq_pool);
>               return NULL;
>       }
>  
> -     return rqd;
> +     return rrqd;
>  }
>  
> -static void rrpc_inflight_laddr_release(struct rrpc *rrpc, struct nvm_rq 
> *rqd)
> +static void rrpc_inflight_laddr_release(struct rrpc *rrpc, struct rrpc_rq 
> *rrqd)
>  {
> -     struct rrpc_inflight_rq *inf = rrpc_get_inflight_rq(rqd);
> -
> -     rrpc_unlock_laddr(rrpc, inf);
> -
> -     mempool_free(rqd, rrpc->rq_pool);
> +     kref_put(&rrqd->refs, rrpc_release_and_free_rrqd);
>  }
>  
>  static void rrpc_discard(struct rrpc *rrpc, struct bio *bio)
>  {
>       sector_t slba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
>       sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
> -     struct nvm_rq *rqd;
> +     struct rrpc_rq *rrqd;
>  
>       do {
> -             rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len);
> +             rrqd = rrpc_inflight_laddr_acquire(rrpc, slba, len);
>               schedule();
> -     } while (!rqd);
> +     } while (!rrqd);
>  
> -     if (IS_ERR(rqd)) {
> +     if (IS_ERR(rrqd)) {
>               pr_err("rrpc: unable to acquire inflight IO\n");
>               bio_io_error(bio);
>               return;
>       }
>  
>       rrpc_invalidate_range(rrpc, slba, len);
> -     rrpc_inflight_laddr_release(rrpc, rqd);
> +     rrpc_inflight_laddr_release(rrpc, rrqd);
>  }
>  
>  static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk)
> @@ -166,8 +173,6 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, 
> struct rrpc_block *rblk)
>  {
>       struct rrpc *rrpc = rlun->rrpc;
>  
> -     BUG_ON(!rblk);
> -
>       if (rlun->cur) {
>               spin_lock(&rlun->cur->lock);
>               WARN_ON(!block_is_full(rrpc, rlun->cur));
> @@ -176,12 +181,107 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, 
> struct rrpc_block *rblk)
>       rlun->cur = rblk;
>  }
>  
> +static void rrpc_free_w_buffer(struct rrpc *rrpc, struct rrpc_block *rblk)
> +{
> +try:
> +     spin_lock(&rblk->w_buf.s_lock);
> +     if (atomic_read(&rblk->w_buf.refs) > 0) {
> +             spin_unlock(&rblk->w_buf.s_lock);
> +             schedule();
> +             goto try;
> +     }
> +
> +     mempool_free(rblk->w_buf.entries, rrpc->block_pool);
> +     rblk->w_buf.entries = NULL;
> +     spin_unlock(&rblk->w_buf.s_lock);
> +
> +     /* TODO: Reuse the same buffers if the block size is the same */
> +     mempool_free(rblk->w_buf.data, rrpc->write_buf_pool);
> +     kfree(rblk->w_buf.sync_bitmap);
> +
> +     rblk->w_buf.mem = NULL;
> +     rblk->w_buf.subm = NULL;
> +     rblk->w_buf.sync_bitmap = NULL;
> +     rblk->w_buf.data = NULL;
> +     rblk->w_buf.nentries = 0;
> +     rblk->w_buf.cur_mem = 0;
> +     rblk->w_buf.cur_subm = 0;
> +}
> +
> +static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
> +{
> +     struct rrpc_lun *rlun = rblk->rlun;
> +     struct nvm_lun *lun = rlun->parent;
> +     struct rrpc_w_buf *buf = &rblk->w_buf;
> +
> +try:
> +     spin_lock(&buf->w_lock);
> +     /* Flush inflight I/Os */
> +     if (!bitmap_full(buf->sync_bitmap, buf->cur_mem)) {
> +             spin_unlock(&buf->w_lock);
> +             schedule();
> +             goto try;
> +     }
> +     spin_unlock(&buf->w_lock);
> +
> +     if (rblk->w_buf.entries)
> +             rrpc_free_w_buffer(rrpc, rblk);
> +
> +     spin_lock(&lun->lock);
> +     nvm_put_blk_unlocked(rrpc->dev, rblk->parent);
> +     list_del(&rblk->list);
> +     spin_unlock(&lun->lock);
> +}
> +
> +static void rrpc_put_blks(struct rrpc *rrpc)
> +{
> +     struct rrpc_lun *rlun;
> +     int i;
> +
> +     for (i = 0; i < rrpc->nr_luns; i++) {
> +             rlun = &rrpc->luns[i];
> +             if (rlun->cur)
> +                     rrpc_put_blk(rrpc, rlun->cur);
> +             if (rlun->gc_cur)
> +                     rrpc_put_blk(rrpc, rlun->gc_cur);
> +     }
> +}
> +
>  static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun 
> *rlun,
>                                                       unsigned long flags)
>  {
> +     struct nvm_dev *dev = rrpc->dev;
>       struct nvm_lun *lun = rlun->parent;
>       struct nvm_block *blk;
>       struct rrpc_block *rblk;
> +     struct buf_entry *entries;
> +     unsigned long *sync_bitmap;
> +     void *data;
> +     int nentries = dev->pgs_per_blk * dev->sec_per_pg;
> +
> +     data = mempool_alloc(rrpc->write_buf_pool, GFP_ATOMIC);
> +     if (!data) {
> +             pr_err("nvm: rrpc: cannot allocate write buffer for block\n");
> +             return NULL;
> +     }
> +
> +     entries = mempool_alloc(rrpc->block_pool, GFP_ATOMIC);
> +     if (!entries) {
> +             pr_err("nvm: rrpc: cannot allocate write buffer for block\n");
> +             mempool_free(data, rrpc->write_buf_pool);
> +             return NULL;
> +     }
> +
> +     /* TODO: Mempool? */

Not on fast path. I don't think we need mempools at all to take some
unnecessary memory.

> +     sync_bitmap = kmalloc(BITS_TO_LONGS(nentries) *
> +                                     sizeof(unsigned long), GFP_ATOMIC);
> +     if (!sync_bitmap) {
> +             mempool_free(data, rrpc->write_buf_pool);
> +             mempool_free(entries, rrpc->block_pool);
> +             return NULL;
> +     }
> +
> +     bitmap_zero(sync_bitmap, nentries);
>  
>       spin_lock(&lun->lock);
>       blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags);
> @@ -192,43 +292,34 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc 
> *rrpc, struct rrpc_lun *rlun,
>       }
>  
>       rblk = &rlun->blocks[blk->id];
> -     list_add_tail(&rblk->list, &rlun->open_list);
> -     spin_unlock(&lun->lock);
>  
>       blk->priv = rblk;
> -     bitmap_zero(rblk->invalid_pages, rrpc->dev->pgs_per_blk);
> +     bitmap_zero(rblk->invalid_pages, dev->pgs_per_blk);
>       rblk->next_page = 0;
>       rblk->nr_invalid_pages = 0;
> -     atomic_set(&rblk->data_cmnt_size, 0);
> +
> +     rblk->w_buf.data = data;
> +     rblk->w_buf.entries = entries;
> +     rblk->w_buf.sync_bitmap = sync_bitmap;
> +
> +     rblk->w_buf.entries->data  = rblk->w_buf.data;
> +     rblk->w_buf.mem = rblk->w_buf.entries;
> +     rblk->w_buf.subm = rblk->w_buf.entries;
> +     rblk->w_buf.nentries = nentries;
> +     rblk->w_buf.cur_mem = 0;
> +     rblk->w_buf.cur_subm = 0;
> +
> +     atomic_set(&rblk->w_buf.refs, 0);
> +
> +     spin_lock_init(&rblk->w_buf.w_lock);
> +     spin_lock_init(&rblk->w_buf.s_lock);
> +
> +     list_add_tail(&rblk->list, &rlun->open_list);
> +     spin_unlock(&lun->lock);
>  
>       return rblk;
>  }
>  
> -static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
> -{
> -     struct rrpc_lun *rlun = rblk->rlun;
> -     struct nvm_lun *lun = rlun->parent;
> -
> -     spin_lock(&lun->lock);
> -     nvm_put_blk_unlocked(rrpc->dev, rblk->parent);
> -     list_del(&rblk->list);
> -     spin_unlock(&lun->lock);
> -}
> -
> -static void rrpc_put_blks(struct rrpc *rrpc)
> -{
> -     struct rrpc_lun *rlun;
> -     int i;
> -
> -     for (i = 0; i < rrpc->nr_luns; i++) {
> -             rlun = &rrpc->luns[i];
> -             if (rlun->cur)
> -                     rrpc_put_blk(rrpc, rlun->cur);
> -             if (rlun->gc_cur)
> -                     rrpc_put_blk(rrpc, rlun->gc_cur);
> -     }
> -}
> -
>  static struct rrpc_lun *get_next_lun(struct rrpc *rrpc)
>  {
>       int next = atomic_inc_return(&rrpc->next_lun);
> @@ -247,6 +338,17 @@ static void rrpc_gc_kick(struct rrpc *rrpc)
>       }
>  }
>  
> +static void rrpc_writer_kick(struct rrpc *rrpc)
> +{
> +     struct rrpc_lun *rlun;
> +     unsigned int i;
> +
> +     for (i = 0; i < rrpc->nr_luns; i++) {
> +             rlun = &rrpc->luns[i];
> +             queue_work(rrpc->kw_wq, &rlun->ws_writer);
> +     }
> +}
> +
>  /*
>   * timed GC every interval.
>   */
> @@ -282,7 +384,7 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, 
> struct rrpc_block *rblk)
>  {
>       struct request_queue *q = rrpc->dev->q;
>       struct rrpc_rev_addr *rev;
> -     struct nvm_rq *rqd;
> +     struct rrpc_rq *rrqd;
>       struct bio *bio;
>       struct page *page;
>       int slot;
> @@ -306,7 +408,7 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, 
> struct rrpc_block *rblk)
>       }
>  
>       while ((slot = find_first_zero_bit(rblk->invalid_pages,
> -                                         nr_pgs_per_blk)) < nr_pgs_per_blk) {
> +                                     nr_pgs_per_blk)) < nr_pgs_per_blk) {

No need to fix the indentation here.

>  
>               /* Lock laddr */
>               phys_addr = (rblk->parent->id * nr_pgs_per_blk) + slot;
> @@ -321,8 +423,8 @@ try:
>                       continue;
>               }
>  
> -             rqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1);
> -             if (IS_ERR_OR_NULL(rqd)) {
> +             rrqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1);
> +             if (IS_ERR_OR_NULL(rrqd)) {
>                       spin_unlock(&rrpc->rev_lock);
>                       schedule();
>                       goto try;
> @@ -339,14 +441,14 @@ try:
>               /* TODO: may fail when EXP_PG_SIZE > PAGE_SIZE */
>               bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
>  
> -             if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
> +             if (rrpc_submit_io(rrpc, bio, rrqd, NVM_IOTYPE_GC)) {
>                       pr_err("rrpc: gc read failed.\n");
> -                     rrpc_inflight_laddr_release(rrpc, rqd);
> +                     rrpc_inflight_laddr_release(rrpc, rrqd);
>                       goto finished;
>               }
>               wait_for_completion_io(&wait);
>               if (bio->bi_error) {
> -                     rrpc_inflight_laddr_release(rrpc, rqd);
> +                     rrpc_inflight_laddr_release(rrpc, rrqd);
>                       goto finished;
>               }
>  
> @@ -363,14 +465,16 @@ try:
>               /* turn the command around and write the data back to a new
>                * address
>                */
> -             if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
> +             if (rrpc_submit_io(rrpc, bio, rrqd, NVM_IOTYPE_GC)
> +                                                     != NVM_IO_DONE) {
>                       pr_err("rrpc: gc write failed.\n");
> -                     rrpc_inflight_laddr_release(rrpc, rqd);
> +                     rrpc_inflight_laddr_release(rrpc, rrqd);
>                       goto finished;
>               }
> +             bio_endio(bio);
>               wait_for_completion_io(&wait);
>  
> -             rrpc_inflight_laddr_release(rrpc, rqd);
> +             /* rrpc_inflight_laddr_release(rrpc, rrqd); */

This is commented out.

>               if (bio->bi_error)
>                       goto finished;
>  
> @@ -514,6 +618,8 @@ static void rrpc_gc_queue(struct work_struct *work)
>       list_move_tail(&rblk->list, &rlun->closed_list);
>       spin_unlock(&lun->lock);
>  
> +     rrpc_free_w_buffer(rrpc, rblk);
> +
>       mempool_free(gcb, rrpc->gcb_pool);
>       pr_debug("nvm: block '%lu' is full, allow GC (sched)\n",
>                                                       rblk->parent->id);
> @@ -663,43 +769,72 @@ static void rrpc_run_gc(struct rrpc *rrpc, struct 
> rrpc_block *rblk)
>       queue_work(rrpc->kgc_wq, &gcb->ws_gc);
>  }
>  
> -static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
> -                                             sector_t laddr, uint8_t npages)
> +static void rrpc_sync_buffer(struct rrpc *rrpc, struct rrpc_addr *p)
>  {
> -     struct rrpc_addr *p;
>       struct rrpc_block *rblk;
> +     struct rrpc_w_buf *buf;
>       struct nvm_lun *lun;
> -     int cmnt_size, i;
> +     unsigned long bppa;
>  
> -     for (i = 0; i < npages; i++) {
> -             p = &rrpc->trans_map[laddr + i];
> -             rblk = p->rblk;
> -             lun = rblk->parent->lun;
> +     rblk = p->rblk;
> +     buf = &rblk->w_buf;
> +     lun = rblk->parent->lun;
>  
> -             cmnt_size = atomic_inc_return(&rblk->data_cmnt_size);
> -             if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk))
> -                     rrpc_run_gc(rrpc, rblk);
> +     bppa = rrpc->dev->sec_per_blk * rblk->parent->id;
> +
> +     WARN_ON(test_and_set_bit((p->addr - bppa), buf->sync_bitmap));
> +
> +     if (unlikely(bitmap_full(buf->sync_bitmap, buf->nentries))) {
> +             /* Write buffer out-of-bounds */
> +             WARN_ON((buf->cur_mem != buf->nentries) &&
> +                                     (buf->cur_mem != buf->cur_subm));
> +
> +             rrpc_run_gc(rrpc, rblk);
> +     }
> +}
> +
> +static void rrpc_end_io_write(struct rrpc *rrpc, struct nvm_rq *rqd,
> +                                                     uint8_t nr_pages)
> +{
> +     struct rrpc_buf_rq *brrqd = nvm_rq_to_pdu(rqd);
> +     struct rrpc_rq *rrqd;
> +     int i;
> +
> +     for (i = 0; i < nr_pages; i++) {
> +             rrqd = brrqd[i].rrqd;
> +             rrpc_sync_buffer(rrpc, brrqd[i].addr);
> +             kref_put(&rrqd->refs, rrpc_release_and_free_rrqd);
>       }
> +
> +     mempool_free(brrqd, rrpc->m_rrq_pool);
> +     rrpc_writer_kick(rrpc);
> +}
> +
> +static void rrpc_end_io_read(struct rrpc *rrpc, struct nvm_rq *rqd,
> +                                                     uint8_t nr_pages)
> +{
> +     struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
> +
> +     if (rqd->flags & NVM_IOTYPE_GC)
> +             return;
> +
> +     rrpc_unlock_rq(rrpc, rrqd);
> +     mempool_free(rrqd, rrpc->rrq_pool);
>  }
>  
>  static void rrpc_end_io(struct nvm_rq *rqd)
>  {
>       struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance);
> -     struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
> -     uint8_t npages = rqd->nr_pages;
> -     sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
> +     uint8_t nr_pages = rqd->nr_pages;
>  
>       if (bio_data_dir(rqd->bio) == WRITE)
> -             rrpc_end_io_write(rrpc, rrqd, laddr, npages);
> +             rrpc_end_io_write(rrpc, rqd, nr_pages);
> +     else
> +             rrpc_end_io_read(rrpc, rqd, nr_pages);
>  
>       bio_put(rqd->bio);
>  
> -     if (rrqd->flags & NVM_IOTYPE_GC)
> -             return;
> -
> -     rrpc_unlock_rq(rrpc, rqd);
> -
> -     if (npages > 1)
> +     if (nr_pages > 1)

Cleanup patches can go in afterwards or before.

>               nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
>       if (rqd->metadata)
>               nvm_dev_dma_free(rrpc->dev, rqd->metadata, rqd->dma_metadata);
> @@ -708,20 +843,24 @@ static void rrpc_end_io(struct nvm_rq *rqd)
>  }
>  
>  static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
> -                     struct nvm_rq *rqd, unsigned long flags, int npages)
> +                     struct nvm_rq *rqd, struct rrpc_buf_rq *brrqd,
> +                     unsigned long flags, int nr_pages)
>  {
> -     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
> +     struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
> +     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rrqd);
>       struct rrpc_addr *gp;
>       sector_t laddr = rrpc_get_laddr(bio);
>       int is_gc = flags & NVM_IOTYPE_GC;
>       int i;
>  
> -     if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
> +     if (!is_gc && rrpc_lock_rq(rrpc, bio, rrqd)) {
>               nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
> +             mempool_free(rrqd, rrpc->rrq_pool);
> +             mempool_free(rqd, rrpc->rq_pool);

Seems like there is a relationship here between rrq_pool (rrpc_rq) and
rq_pool (nvm_rq)

To get to a single mempool alloc ( and just keep the nvm_rq mempool )



>               return NVM_IO_REQUEUE;
>       }
>  
> -     for (i = 0; i < npages; i++) {
> +     for (i = 0; i < nr_pages; i++) {
>               /* We assume that mapping occurs at 4KB granularity */
>               BUG_ON(!(laddr + i >= 0 && laddr + i < rrpc->nr_sects));
>               gp = &rrpc->trans_map[laddr + i];
> @@ -734,8 +873,12 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, 
> struct bio *bio,
>                       rrpc_unlock_laddr(rrpc, r);
>                       nvm_dev_dma_free(rrpc->dev, rqd->ppa_list,
>                                                       rqd->dma_ppa_list);
> +                     mempool_free(rrqd, rrpc->rrq_pool);
> +                     mempool_free(rqd, rrpc->rq_pool);
>                       return NVM_IO_DONE;
>               }
> +
> +             brrqd[i].addr = gp;
>       }
>  
>       rqd->opcode = NVM_OP_HBREAD;
> @@ -751,8 +894,11 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio 
> *bio, struct nvm_rq *rqd,
>       sector_t laddr = rrpc_get_laddr(bio);
>       struct rrpc_addr *gp;
>  
> -     if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
> +     if (!is_gc && rrpc_lock_rq(rrpc, bio, rrqd)) {
> +             mempool_free(rrqd, rrpc->rrq_pool);
> +             mempool_free(rqd, rrpc->rq_pool);
>               return NVM_IO_REQUEUE;
> +     }
>  
>       BUG_ON(!(laddr >= 0 && laddr < rrpc->nr_sects));
>       gp = &rrpc->trans_map[laddr];
> @@ -761,7 +907,9 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio 
> *bio, struct nvm_rq *rqd,
>               rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp->addr);
>       } else {
>               BUG_ON(is_gc);
> -             rrpc_unlock_rq(rrpc, rqd);
> +             rrpc_unlock_rq(rrpc, rrqd);
> +             mempool_free(rrqd, rrpc->rrq_pool);
> +             mempool_free(rqd, rrpc->rq_pool);
>               return NVM_IO_DONE;
>       }
>  
> @@ -771,120 +919,190 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio 
> *bio, struct nvm_rq *rqd,
>       return NVM_IO_OK;
>  }
>  
> +/*
> + * Copy data from current bio to block write buffer. This is necessary
> + * to guarantee durability if a flash block becomes bad before all pages
> + * are written. This buffer is also used to write at the right page
> + * granularity
> + */
> +static int rrpc_write_to_buffer(struct rrpc *rrpc, struct bio *bio,
> +                             struct rrpc_rq *rrqd, struct rrpc_addr *addr,
> +                             struct rrpc_w_buf *w_buf,
> +                             unsigned long flags)
> +{
> +     struct nvm_dev *dev = rrpc->dev;
> +     unsigned int bio_len = bio_cur_bytes(bio);
> +
> +     if (bio_len != RRPC_EXPOSED_PAGE_SIZE)
> +             return NVM_IO_ERR;
> +
> +     spin_lock(&w_buf->w_lock);
> +
> +     WARN_ON(w_buf->cur_mem == w_buf->nentries);
> +
> +     w_buf->mem->rrqd = rrqd;
> +     w_buf->mem->addr = addr;
> +     w_buf->mem->flags = flags;
> +
> +     memcpy(w_buf->mem->data, bio_data(bio), bio_len);
> +
> +     w_buf->cur_mem++;
> +     if (likely(w_buf->cur_mem < w_buf->nentries)) {
> +             w_buf->mem++;
> +             w_buf->mem->data =
> +                             w_buf->data + (w_buf->cur_mem * dev->sec_size);
> +     }
> +
> +     spin_unlock(&w_buf->w_lock);
> +
> +     return 0;
> +}
> +
>  static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
> -                     struct nvm_rq *rqd, unsigned long flags, int npages)
> +                     struct rrpc_rq *rrqd, unsigned long flags, int nr_pages)
>  {
> -     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
> +     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rrqd);
> +     struct rrpc_w_buf *w_buf;
>       struct rrpc_addr *p;
> +     struct rrpc_lun *rlun;
>       sector_t laddr = rrpc_get_laddr(bio);
>       int is_gc = flags & NVM_IOTYPE_GC;
> +     int err;
>       int i;
>  
> -     if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
> -             nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
> +     if (!is_gc && rrpc_lock_rq(rrpc, bio, rrqd)) {
> +             mempool_free(rrqd, rrpc->rrq_pool);
>               return NVM_IO_REQUEUE;
>       }
>  
> -     for (i = 0; i < npages; i++) {
> +     for (i = 0; i < nr_pages; i++) {
> +             kref_get(&rrqd->refs);
> +
>               /* We assume that mapping occurs at 4KB granularity */
>               p = rrpc_map_page(rrpc, laddr + i, is_gc);
>               if (!p) {
>                       BUG_ON(is_gc);
>                       rrpc_unlock_laddr(rrpc, r);
> -                     nvm_dev_dma_free(rrpc->dev, rqd->ppa_list,
> -                                                     rqd->dma_ppa_list);
> +                     mempool_free(rrqd, rrpc->rrq_pool);
>                       rrpc_gc_kick(rrpc);
>                       return NVM_IO_REQUEUE;
>               }
>  
> -             rqd->ppa_list[i] = rrpc_ppa_to_gaddr(rrpc->dev,
> -                                                             p->addr);
> +             w_buf = &p->rblk->w_buf;
> +             rlun = p->rblk->rlun;
> +
> +             rrqd->addr = p;
> +
> +             err = rrpc_write_to_buffer(rrpc, bio, rrqd, p, w_buf, flags);
> +             if (err) {
> +                     pr_err("rrpc: could not write to write buffer\n");
> +                     return err;
> +             }
> +
> +             bio_advance(bio, RRPC_EXPOSED_PAGE_SIZE);
> +
> +             queue_work(rrpc->kw_wq, &rlun->ws_writer);

How about a timer and only the kick when a flush command is sent?

>       }
>  
> -     rqd->opcode = NVM_OP_HBWRITE;
> +     if (kref_put(&rrqd->refs, rrpc_release_and_free_rrqd)) {
> +             pr_err("rrpc: request reference counter failed\n");
> +             return NVM_IO_ERR;
> +     }
>  
> -     return NVM_IO_OK;
> +     return NVM_IO_DONE;
>  }
>  
>  static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
> -                             struct nvm_rq *rqd, unsigned long flags)
> +                             struct rrpc_rq *rrqd, unsigned long flags)
>  {
> -     struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
> +     struct rrpc_w_buf *w_buf;
>       struct rrpc_addr *p;
> +     struct rrpc_lun *rlun;
>       int is_gc = flags & NVM_IOTYPE_GC;
> +     int err;
>       sector_t laddr = rrpc_get_laddr(bio);
>  
> -     if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
> +     if (!is_gc && rrpc_lock_rq(rrpc, bio, rrqd)) {
> +             mempool_free(rrqd, rrpc->rrq_pool);
>               return NVM_IO_REQUEUE;
> +     }
>  
>       p = rrpc_map_page(rrpc, laddr, is_gc);
>       if (!p) {
>               BUG_ON(is_gc);
> -             rrpc_unlock_rq(rrpc, rqd);
> +             rrpc_unlock_rq(rrpc, rrqd);
> +             mempool_free(rrqd, rrpc->rrq_pool);
>               rrpc_gc_kick(rrpc);
>               return NVM_IO_REQUEUE;
>       }
>  
> -     rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, p->addr);
> -     rqd->opcode = NVM_OP_HBWRITE;
> +     w_buf = &p->rblk->w_buf;
> +     rlun = p->rblk->rlun;
> +
>       rrqd->addr = p;
>  
> -     return NVM_IO_OK;
> +     err = rrpc_write_to_buffer(rrpc, bio, rrqd, p, w_buf, flags);
> +     if (err) {
> +             pr_err("rrpc: could not write to write buffer\n");
> +             return err;
> +     }
> +
> +     queue_work(rrpc->kw_wq, &rlun->ws_writer);
> +     return NVM_IO_DONE;
>  }
>  
> -static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
> -                     struct nvm_rq *rqd, unsigned long flags, uint8_t npages)
> +static int rrpc_submit_read(struct rrpc *rrpc, struct bio *bio,
> +                             struct rrpc_rq *rrqd, unsigned long flags)
>  {
> -     if (npages > 1) {
> +     struct nvm_rq *rqd;
> +     struct rrpc_buf_rq brrqd[rrpc->max_write_pgs];
> +     uint8_t nr_pages = rrpc_get_pages(bio);
> +     int err;
> +
> +     rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
> +     if (!rqd) {
> +             pr_err_ratelimited("rrpc: not able to queue bio.");
> +             bio_io_error(bio);
> +             return NVM_IO_ERR;
> +     }
> +     rqd->metadata = NULL;
> +     rqd->priv = rrqd;
> +
> +     if (nr_pages > 1) {
>               rqd->ppa_list = nvm_dev_dma_alloc(rrpc->dev, GFP_KERNEL,
> -                                                     &rqd->dma_ppa_list);
> +                                             &rqd->dma_ppa_list);
>               if (!rqd->ppa_list) {
>                       pr_err("rrpc: not able to allocate ppa list\n");
> +                     mempool_free(rrqd, rrpc->rrq_pool);
> +                     mempool_free(rqd, rrpc->rq_pool);
>                       return NVM_IO_ERR;
>               }
>  
> -             if (bio_rw(bio) == WRITE)
> -                     return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags,
> -                                                                     npages);
> -
> -             return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages);
> +             err = rrpc_read_ppalist_rq(rrpc, bio, rqd, brrqd, flags,
> +                                                             nr_pages);
> +             if (err) {
> +                     mempool_free(rrqd, rrpc->rrq_pool);
> +                     mempool_free(rqd, rrpc->rq_pool);
> +                     return err;
> +             }
> +     } else {
> +             err = rrpc_read_rq(rrpc, bio, rqd, flags);
> +             if (err)
> +                     return err;
>       }
>  
> -     if (bio_rw(bio) == WRITE)
> -             return rrpc_write_rq(rrpc, bio, rqd, flags);
> -
> -     return rrpc_read_rq(rrpc, bio, rqd, flags);
> -}
> -
> -static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
> -                             struct nvm_rq *rqd, unsigned long flags)
> -{
> -     int err;
> -     struct rrpc_rq *rrq = nvm_rq_to_pdu(rqd);
> -     uint8_t nr_pages = rrpc_get_pages(bio);
> -     int bio_size = bio_sectors(bio) << 9;
> -
> -     if (bio_size < rrpc->dev->sec_size)
> -             return NVM_IO_ERR;
> -     else if (bio_size > rrpc->dev->max_rq_size)
> -             return NVM_IO_ERR;
> -
> -     err = rrpc_setup_rq(rrpc, bio, rqd, flags, nr_pages);
> -     if (err)
> -             return err;
> -
>       bio_get(bio);
>       rqd->bio = bio;
>       rqd->ins = &rrpc->instance;
> -     rqd->nr_pages = nr_pages;
> -     rrq->flags = flags;
> +     rqd->nr_pages = rrqd->nr_pages = nr_pages;
> +     rqd->flags = flags;
>  
>       err = nvm_submit_io(rrpc->dev, rqd);
>       if (err) {
>               pr_err("rrpc: I/O submission failed: %d\n", err);
>               bio_put(bio);
>               if (!(flags & NVM_IOTYPE_GC)) {
> -                     rrpc_unlock_rq(rrpc, rqd);
> +                     rrpc_unlock_rq(rrpc, rrqd);
>                       if (rqd->nr_pages > 1)
>                               nvm_dev_dma_free(rrpc->dev,
>                       rqd->ppa_list, rqd->dma_ppa_list);
> @@ -895,10 +1113,39 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct 
> bio *bio,
>       return NVM_IO_OK;
>  }
>  
> +static int rrpc_buffer_write(struct rrpc *rrpc, struct bio *bio,
> +                             struct rrpc_rq *rrqd, unsigned long flags)
> +{
> +     uint8_t nr_pages = rrpc_get_pages(bio);
> +
> +     rrqd->nr_pages = nr_pages;
> +
> +     if (nr_pages > 1)
> +             return rrpc_write_ppalist_rq(rrpc, bio, rrqd, flags, nr_pages);
> +     else

You can skip the else here.

> +             return rrpc_write_rq(rrpc, bio, rrqd, flags);
> +}
> +
> +static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
> +                             struct rrpc_rq *rrqd, unsigned long flags)
> +{
> +     int bio_size = bio_sectors(bio) << 9;
> +
> +     if (bio_size < rrpc->dev->sec_size)
> +             return NVM_IO_ERR;
> +     else if (bio_size > rrpc->dev->max_rq_size)
> +             return NVM_IO_ERR;

I have a patch incoming that removes this. Probably making
rrpc_submit_io obsolete.

> +
> +     if (bio_rw(bio) == READ)
> +             return rrpc_submit_read(rrpc, bio, rrqd, flags);
> +
> +     return rrpc_buffer_write(rrpc, bio, rrqd, flags);
> +}
> +
>  static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
>  {
>       struct rrpc *rrpc = q->queuedata;
> -     struct nvm_rq *rqd;
> +     struct rrpc_rq *rrqd;
>       int err;
>  
>       if (bio->bi_rw & REQ_DISCARD) {
> @@ -906,15 +1153,15 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, 
> struct bio *bio)
>               return BLK_QC_T_NONE;
>       }
>  
> -     rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
> -     if (!rqd) {
> -             pr_err_ratelimited("rrpc: not able to queue bio.");
> +     rrqd = mempool_alloc(rrpc->rrq_pool, GFP_KERNEL);
> +     if (!rrqd) {
> +             pr_err_ratelimited("rrpc: not able to allocate rrqd.");
>               bio_io_error(bio);
>               return BLK_QC_T_NONE;
>       }
> -     memset(rqd, 0, sizeof(struct nvm_rq));

It seems like we don't use nvm_rq in the write path? (the writer thread
will do the write itself). Move this inside read? and use the original
nvm_rq for submission?

> +     kref_init(&rrqd->refs);
>  
> -     err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
> +     err = rrpc_submit_io(rrpc, bio, rrqd, NVM_IOTYPE_NONE);
>       switch (err) {
>       case NVM_IO_OK:
>               return BLK_QC_T_NONE;
> @@ -932,10 +1179,221 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, 
> struct bio *bio)
>               break;
>       }
>  
> -     mempool_free(rqd, rrpc->rq_pool);
>       return BLK_QC_T_NONE;
>  }
>  
> +static int rrpc_alloc_page_in_bio(struct rrpc *rrpc, struct bio *bio,
> +                                                             void *data)
> +{
> +     struct page *page;
> +     int err;
> +
> +     if (PAGE_SIZE != RRPC_EXPOSED_PAGE_SIZE)
> +             return -1;

return -EINVAL?

This doesn't work on power and sparc architectures (with larger
PAGE_SIZEs). Would be great to handle that as well.

> +
> +     page = virt_to_page(data);
> +     if (!page) {
> +             pr_err("nvm: rrpc: could not alloc page\n");
> +             return -1;
> +     }
> +
> +     err = bio_add_page(bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
> +     if (err != RRPC_EXPOSED_PAGE_SIZE) {
> +             pr_err("nvm: rrpc: could not add page to bio\n");
> +             return -1;
> +     }
> +
> +     return 0;
> +}
> +
> +static void rrpc_submit_write(struct work_struct *work)
> +{
> +     struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_writer);
> +     struct rrpc *rrpc = rlun->rrpc;
> +     struct nvm_dev *dev = rrpc->dev;
> +     struct rrpc_addr *addr;
> +     struct rrpc_rq *rrqd;
> +     struct rrpc_buf_rq *brrqd;
> +     void *data;
> +     struct nvm_rq *rqd;
> +     struct rrpc_block *rblk;
> +     struct bio *bio;
> +     int pgs_to_sync, pgs_avail;
> +     int sync = NVM_SYNC_HARD;
> +     int err;
> +     int i;
> +
> +     /* Note that OS pages are typically mapped to flash page sectors, which
> +      * are 4K; a flash page might be formed of several sectors. Also,
> +      * controllers typically program flash pages across multiple planes.
> +      * This is the flash programing granurality, and the reason behind the
> +      * sync strategy performed in this write thread.
> +      */
> +try:
> +     spin_lock(&rlun->parent->lock);
> +     list_for_each_entry(rblk, &rlun->open_list, list) {
> +             if (!spin_trylock(&rblk->w_buf.w_lock))
> +                     continue;
> +
> +             /* If the write thread has already submitted all I/Os in the
> +              * write buffer for this block ignore that the block is in the
> +              * open list; it is on its way to the closed list. This enables
> +              * us to avoid taking a lock on the list.
> +              */
> +             if (unlikely(rblk->w_buf.cur_subm == rblk->w_buf.nentries)) {
> +                     spin_unlock(&rblk->w_buf.w_lock);
> +                     spin_unlock(&rlun->parent->lock);
> +                     schedule();
> +                     goto try;
> +             }
> +             pgs_avail = rblk->w_buf.cur_mem - rblk->w_buf.cur_subm;
> +
> +             switch (sync) {
> +             case NVM_SYNC_SOFT:
> +                     pgs_to_sync = (pgs_avail >= rrpc->max_write_pgs) ?
> +                                     rrpc->max_write_pgs : 0;
> +                     break;
> +             case NVM_SYNC_HARD:
> +                     if (pgs_avail >= rrpc->max_write_pgs)
> +                             pgs_to_sync = rrpc->max_write_pgs;
> +                     else if (pgs_avail >= rrpc->min_write_pgs)
> +                             pgs_to_sync = rrpc->min_write_pgs *
> +                                     (pgs_avail / rrpc->min_write_pgs);
> +                     else
> +                             pgs_to_sync = pgs_avail; /* TODO: ADD PADDING */
> +                     break;
> +             case NVM_SYNC_OPORT:
> +                     if (pgs_avail >= rrpc->max_write_pgs)
> +                             pgs_to_sync = rrpc->max_write_pgs;
> +                     else if (pgs_avail >= rrpc->min_write_pgs)
> +                             pgs_to_sync = rrpc->min_write_pgs *
> +                                     (pgs_avail / rrpc->min_write_pgs);
> +                     else
> +                             pgs_to_sync = 0;
> +             }
> +
> +             if (pgs_to_sync == 0) {
> +                     spin_unlock(&rblk->w_buf.w_lock);
> +                     continue;
> +             }
> +
> +             bio = bio_alloc(GFP_ATOMIC, pgs_to_sync);
> +             if (!bio) {
> +                     pr_err("nvm: rrpc: could not alloc write bio\n");
> +                     goto out1;
> +             }
> +
> +             rqd = mempool_alloc(rrpc->rq_pool, GFP_ATOMIC);
> +             if (!rqd) {
> +                     pr_err_ratelimited("rrpc: not able to create w req.");
> +                     goto out2;
> +             }
> +             rqd->metadata = NULL;
> +
> +             brrqd = mempool_alloc(rrpc->m_rrq_pool, GFP_ATOMIC);
> +             if (!brrqd) {
> +                     pr_err_ratelimited("rrpc: not able to create w rea.");
> +                     goto out3;
> +             }
> +
> +             bio->bi_iter.bi_sector = 0; /* artificial bio */
> +             bio->bi_rw = WRITE;
> +
> +             rqd->opcode = NVM_OP_HBWRITE;
> +             rqd->bio = bio;
> +             rqd->ins = &rrpc->instance;
> +             rqd->nr_pages = pgs_to_sync;
> +             rqd->priv = brrqd;
> +
> +             if (pgs_to_sync == 1) {
> +                     rrqd = rblk->w_buf.subm->rrqd;
> +                     addr = rblk->w_buf.subm->addr;
> +                     rqd->flags = rblk->w_buf.subm->flags;
> +                     data = rblk->w_buf.subm->data;
> +
> +                     err = rrpc_alloc_page_in_bio(rrpc, bio, data);
> +                     if (err) {
> +                             pr_err("rrpc: cannot allocate page in bio\n");
> +                             goto out4;
> +                     }
> +
> +                     /* TODO: This address can be skipped */
> +                     if (addr->addr == ADDR_EMPTY)
> +                             pr_err_ratelimited("rrpc: submitting empty rq");
> +
> +                     rqd->ppa_addr = rrpc_ppa_to_gaddr(dev, addr->addr);
> +
> +                     brrqd[0].rrqd = rrqd;
> +                     brrqd[0].addr = addr;
> +
> +                     rblk->w_buf.subm++;
> +                     rblk->w_buf.cur_subm++;
> +
> +                     goto submit_io;
> +             }
> +
> +             /* This bio will contain several pppas */
> +             rqd->ppa_list = nvm_dev_dma_alloc(rrpc->dev, GFP_ATOMIC,
> +                                                     &rqd->dma_ppa_list);
> +             if (!rqd->ppa_list) {
> +                     pr_err("rrpc: not able to allocate ppa list\n");
> +                     goto out4;
> +             }
> +
> +             for (i = 0; i < pgs_to_sync; i++) {
> +                     rrqd = rblk->w_buf.subm->rrqd;
> +                     addr = rblk->w_buf.subm->addr;
> +                     rqd->flags = rblk->w_buf.subm->flags;
> +                     data = rblk->w_buf.subm->data;
> +
> +                     err = rrpc_alloc_page_in_bio(rrpc, bio, data);
> +                     if (err) {
> +                             pr_err("rrpc: cannot allocate page in bio\n");
> +                             goto out5;
> +                     }
> +
> +                     /* TODO: This address can be skipped */
> +                     if (addr->addr == ADDR_EMPTY)
> +                             pr_err_ratelimited("rrpc: submitting empty rq");
> +
> +                     rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, addr->addr);
> +
> +                     brrqd[i].rrqd = rrqd;
> +                     brrqd[i].addr = addr;
> +
> +                     rblk->w_buf.subm++;
> +                     rblk->w_buf.cur_subm++;
> +             }
> +
> +submit_io:
> +             WARN_ON(rblk->w_buf.cur_subm > rblk->w_buf.nentries);
> +
> +             spin_unlock(&rblk->w_buf.w_lock);
> +
> +             err = nvm_submit_io(dev, rqd);
> +             if (err) {
> +                     pr_err("rrpc: I/O submission failed: %d\n", err);
> +                     mempool_free(rqd, rrpc->rq_pool);
> +                     bio_put(bio);
> +             }
> +     }
> +
> +     spin_unlock(&rlun->parent->lock);
> +     return;
> +
> +out5:
> +     nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
> +out4:
> +     mempool_free(brrqd, rrpc->m_rrq_pool);
> +out3:
> +     mempool_free(rqd, rrpc->rq_pool);
> +out2:
> +     bio_put(bio);
> +out1:
> +     spin_unlock(&rblk->w_buf.w_lock);
> +     spin_unlock(&rlun->parent->lock);
> +}
> +

The function can at least be split up in two parts: deciding what to sync,
and the actual write. The out* labels should be replaced with more sane names.

>  static void rrpc_requeue(struct work_struct *work)
>  {
>       struct rrpc *rrpc = container_of(work, struct rrpc, ws_requeue);
> @@ -978,7 +1436,7 @@ static void rrpc_gc_free(struct rrpc *rrpc)
>  
>  static int rrpc_gc_init(struct rrpc *rrpc)
>  {
> -     rrpc->krqd_wq = alloc_workqueue("rrpc-lun", WQ_MEM_RECLAIM|WQ_UNBOUND,
> +     rrpc->krqd_wq = alloc_workqueue("rrpc-lun", WQ_MEM_RECLAIM | WQ_UNBOUND,

No need for reformatting

>                                                               rrpc->nr_luns);
>       if (!rrpc->krqd_wq)
>               return -ENOMEM;
> @@ -1080,6 +1538,8 @@ static int rrpc_map_init(struct rrpc *rrpc)
>  
>  static int rrpc_core_init(struct rrpc *rrpc)
>  {
> +     struct nvm_dev *dev = rrpc->dev;
> +
>       down_write(&rrpc_lock);
>       if (!rrpc_gcb_cache) {
>               rrpc_gcb_cache = kmem_cache_create("rrpc_gcb",
> @@ -1089,14 +1549,70 @@ static int rrpc_core_init(struct rrpc *rrpc)
>                       return -ENOMEM;
>               }
>  
> -             rrpc_rq_cache = kmem_cache_create("rrpc_rq",
> -                             sizeof(struct nvm_rq) + sizeof(struct rrpc_rq),
> -                             0, 0, NULL);
> +             rrpc_rq_cache = kmem_cache_create("nvm_rq",
> +                                     sizeof(struct nvm_rq), 0, 0, NULL);
>               if (!rrpc_rq_cache) {
>                       kmem_cache_destroy(rrpc_gcb_cache);
>                       up_write(&rrpc_lock);
>                       return -ENOMEM;
>               }
> +
> +             rrpc_rrq_cache = kmem_cache_create("rrpc_rrq",
> +                                     sizeof(struct rrpc_rq), 0, 0, NULL);
> +             if (!rrpc_rrq_cache) {
> +                     kmem_cache_destroy(rrpc_gcb_cache);
> +                     kmem_cache_destroy(rrpc_rq_cache);
> +                     up_write(&rrpc_lock);
> +                     return -ENOMEM;
> +             }
> +
> +             rrpc_buf_rrq_cache = kmem_cache_create("rrpc_m_rrq",
> +                     rrpc->max_write_pgs * sizeof(struct rrpc_buf_rq),
> +                     0, 0, NULL);
> +             if (!rrpc_buf_rrq_cache) {
> +                     kmem_cache_destroy(rrpc_gcb_cache);
> +                     kmem_cache_destroy(rrpc_rq_cache);
> +                     kmem_cache_destroy(rrpc_rrq_cache);
> +                     up_write(&rrpc_lock);
> +                     return -ENOMEM;
> +             }
> +     }
> +
> +     /* we assume that sec->sec_size is the same as the page size exposed by
> +      * rrpc (4KB). We need extra logic otherwise
> +      */
> +     if (!rrpc_block_cache) {
> +             /* Write buffer: Allocate all buffer (for all block) at once. We
> +              * avoid having to allocate a memory from the pool for each IO
> +              * at the cost pre-allocating memory for the whole block when a
> +              * new block is allocated from the media manager.
> +              */
> +             rrpc_wb_cache = kmem_cache_create("nvm_wb",
> +                     dev->pgs_per_blk * dev->sec_per_pg * dev->sec_size,
> +                     0, 0, NULL);

Seems a bit excessive to allocate a mempool for this. Why not do a
vmalloc when the memory is needed? The normal case is 4MB? per
entry, and the cache creates 8 of them in the pool, using 64MB out of
the gate. We are not in a hot path at this point in time, so let's not
waste the resources when we might not use them.

> +             if (!rrpc_wb_cache) {
> +                     kmem_cache_destroy(rrpc_gcb_cache);
> +                     kmem_cache_destroy(rrpc_rq_cache);
> +                     kmem_cache_destroy(rrpc_rrq_cache);
> +                     kmem_cache_destroy(rrpc_buf_rrq_cache);
> +                     up_write(&rrpc_lock);
> +                     return -ENOMEM;
> +             }
> +
> +             /* Write buffer entries */
> +             rrpc_block_cache = kmem_cache_create("nvm_entry",
> +                     dev->pgs_per_blk * dev->sec_per_pg *
> +                     sizeof(struct buf_entry),
> +                     0, 0, NULL);
> +             if (!rrpc_block_cache) {
> +                     kmem_cache_destroy(rrpc_gcb_cache);
> +                     kmem_cache_destroy(rrpc_rq_cache);
> +                     kmem_cache_destroy(rrpc_rrq_cache);
> +                     kmem_cache_destroy(rrpc_buf_rrq_cache);
> +                     kmem_cache_destroy(rrpc_wb_cache);
> +                     up_write(&rrpc_lock);
> +                     return -ENOMEM;
> +             }
>       }
>       up_write(&rrpc_lock);
>  
> @@ -1113,17 +1629,45 @@ static int rrpc_core_init(struct rrpc *rrpc)
>       if (!rrpc->rq_pool)
>               return -ENOMEM;
>  
> +     rrpc->rrq_pool = mempool_create_slab_pool(64, rrpc_rrq_cache);
> +     if (!rrpc->rrq_pool)
> +             return -ENOMEM;
> +
> +     rrpc->m_rrq_pool = mempool_create_slab_pool(64, rrpc_buf_rrq_cache);
> +     if (!rrpc->m_rrq_pool)
> +             return -ENOMEM;
> +
> +     rrpc->block_pool = mempool_create_slab_pool(8, rrpc_block_cache);
> +     if (!rrpc->block_pool)
> +             return -ENOMEM;
> +
> +     rrpc->write_buf_pool = mempool_create_slab_pool(8, rrpc_wb_cache);
> +     if (!rrpc->write_buf_pool)
> +             return -ENOMEM;

A lot of pools. I hope the nvm_rq, rrpc_rq, and rrpc_buf_rq consolidation
can move it back to just a nvm_rq entry.

> +
>       spin_lock_init(&rrpc->inflights.lock);
>       INIT_LIST_HEAD(&rrpc->inflights.reqs);
>  
> +     rrpc->kw_wq = alloc_workqueue("rrpc-writer",
> +                             WQ_MEM_RECLAIM | WQ_UNBOUND, rrpc->nr_luns);
> +     if (!rrpc->kw_wq)
> +             return -ENOMEM;
> +
>       return 0;
>  }
>  
>  static void rrpc_core_free(struct rrpc *rrpc)
>  {
> +     if (rrpc->kw_wq)
> +             destroy_workqueue(rrpc->kw_wq);
> +
>       mempool_destroy(rrpc->page_pool);
>       mempool_destroy(rrpc->gcb_pool);
> +     mempool_destroy(rrpc->m_rrq_pool);
> +     mempool_destroy(rrpc->rrq_pool);
>       mempool_destroy(rrpc->rq_pool);
> +     mempool_destroy(rrpc->block_pool);
> +     mempool_destroy(rrpc->write_buf_pool);
>  }
>  
>  static void rrpc_luns_free(struct rrpc *rrpc)
> @@ -1164,6 +1708,8 @@ static int rrpc_luns_init(struct rrpc *rrpc, int 
> lun_begin, int lun_end)
>               INIT_LIST_HEAD(&rlun->open_list);
>               INIT_LIST_HEAD(&rlun->closed_list);
>  
> +             INIT_WORK(&rlun->ws_writer, rrpc_submit_write);
> +
>               INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
>               spin_lock_init(&rlun->lock);
>  
> @@ -1209,6 +1755,7 @@ static void rrpc_exit(void *private)
>  
>       flush_workqueue(rrpc->krqd_wq);
>       flush_workqueue(rrpc->kgc_wq);
> +     /* flush_workqueue(rrpc->kw_wq); */ /* TODO: Implement flush + padding*/

This can come in an additional patch.

>  
>       rrpc_free(rrpc);
>  }
> diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
> index 868e91a..6e188b4 100644
> --- a/drivers/lightnvm/rrpc.h
> +++ b/drivers/lightnvm/rrpc.h
> @@ -49,17 +49,67 @@ struct rrpc_inflight_rq {
>  struct rrpc_rq {
>       struct rrpc_inflight_rq inflight_rq;
>       struct rrpc_addr *addr;
> +     int nr_pages;
> +
> +     struct kref refs;

I think what you really want to do here is to add the ref counter to
rrpc_inflight_addr and then have that maintain when to free.
(also, name it kref)

> +};
> +
> +struct rrpc_buf_rq {
> +     struct rrpc_addr *addr;
> +     struct rrpc_rq *rrqd;

I'm not sure why we keep a rrpc_buf_rq->addr and rrpc_rq->addr?

> +};
> +
> +/* Sync strategies from write buffer to media */
> +enum {
> +     NVM_SYNC_SOFT   = 0x0,          /* Only submit at max_write_pgs
> +                                      * supported by the device. Typically 64
> +                                      * pages (256k).
> +                                      */
> +     NVM_SYNC_HARD   = 0x1,          /* Submit the whole buffer. Add padding
> +                                      * if necessary to respect the device's
> +                                      * min_write_pgs.
> +                                      */
> +     NVM_SYNC_OPORT  = 0x2,          /* Submit what we can, always respecting
> +                                      * the device's min_write_pgs.
> +                                      */

That is great. This should probably be exposed as a sysfs option later.
> +};
> +
> +struct buf_entry {
> +     struct rrpc_rq *rrqd;
> +     void *data;
> +     struct rrpc_addr *addr;
>       unsigned long flags;
>  };
>  
> +struct rrpc_w_buf {
> +     struct buf_entry *entries;      /* Entries */
> +     struct buf_entry *mem;          /* Points to the next writable entry */
> +     struct buf_entry *subm;         /* Points to the last submitted entry */
> +     int cur_mem;                    /* Current memory entry. Follows mem */
> +     int cur_subm;                   /* Entries have been submitted to dev */
> +     int nentries;                   /* Number of entries in write buffer */

nr_entries?

> +
> +     void *data;                     /* Actual data */
> +     unsigned long *sync_bitmap;     /* Bitmap representing physical
> +                                      * addresses that have been synced to
> +                                      * the media
> +                                      */
> +
> +     atomic_t refs;

kref? semaphore?

> +
> +     spinlock_t w_lock;
> +     spinlock_t s_lock;

Is it short for submission lock?

> +};
> +
>  struct rrpc_block {
>       struct nvm_block *parent;
>       struct rrpc_lun *rlun;
>       struct list_head prio;
>       struct list_head list;
> +     struct rrpc_w_buf w_buf;
>  
>  #define MAX_INVALID_PAGES_STORAGE 8
> -     /* Bitmap for invalid page intries */
> +     /* Bitmap for invalid page entries */
>       unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE];
>       /* points to the next writable page within a block */
>       unsigned int next_page;
> @@ -67,7 +117,6 @@ struct rrpc_block {
>       unsigned int nr_invalid_pages;
>  
>       spinlock_t lock;
> -     atomic_t data_cmnt_size; /* data pages committed to stable storage */
>  };
>  
>  struct rrpc_lun {
> @@ -86,6 +135,7 @@ struct rrpc_lun {
>                                        */
>  
>       struct work_struct ws_gc;
> +     struct work_struct ws_writer;
>  
>       spinlock_t lock;
>  };
> @@ -136,10 +186,15 @@ struct rrpc {
>       mempool_t *page_pool;
>       mempool_t *gcb_pool;
>       mempool_t *rq_pool;
> +     mempool_t *rrq_pool;
> +     mempool_t *m_rrq_pool;
> +     mempool_t *block_pool;
> +     mempool_t *write_buf_pool;
>  
>       struct timer_list gc_timer;
>       struct workqueue_struct *krqd_wq;
>       struct workqueue_struct *kgc_wq;
> +     struct workqueue_struct *kw_wq;
>  };
>  
>  struct rrpc_block_gc {
> @@ -181,7 +236,7 @@ static inline int request_intersects(struct 
> rrpc_inflight_rq *r,
>  }
>  
>  static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
> -                          unsigned pages, struct rrpc_inflight_rq *r)
> +                             unsigned pages, struct rrpc_inflight_rq *r)
>  {
>       sector_t laddr_end = laddr + pages - 1;
>       struct rrpc_inflight_rq *rtmp;
> @@ -206,27 +261,26 @@ static int __rrpc_lock_laddr(struct rrpc *rrpc, 
> sector_t laddr,
>  }
>  
>  static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
> -                              unsigned pages,
> -                              struct rrpc_inflight_rq *r)
> +                             unsigned pages,
> +                             struct rrpc_inflight_rq *r)
>  {
>       BUG_ON((laddr + pages) > rrpc->nr_sects);
>  
>       return __rrpc_lock_laddr(rrpc, laddr, pages, r);
>  }
>  
> -static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq 
> *rqd)
> +static inline struct rrpc_inflight_rq
> +                             *rrpc_get_inflight_rq(struct rrpc_rq *rrqd)
>  {
> -     struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
> -
>       return &rrqd->inflight_rq;
>  }
>  
>  static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio,
> -                                                     struct nvm_rq *rqd)
> +                                                     struct rrpc_rq *rrqd)
>  {
>       sector_t laddr = rrpc_get_laddr(bio);
>       unsigned int pages = rrpc_get_pages(bio);
> -     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
> +     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rrqd);
>  
>       return rrpc_lock_laddr(rrpc, laddr, pages, r);
>  }
> @@ -241,12 +295,12 @@ static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
>       spin_unlock_irqrestore(&rrpc->inflights.lock, flags);
>  }
>  
> -static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
> +static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct rrpc_rq *rrqd)
>  {
> -     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
> -     uint8_t pages = rqd->nr_pages;
> +     struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rrqd);
> +     uint8_t nr_pages = rrqd->nr_pages;

The cleanups can be moved to another patch.

>  
> -     BUG_ON((r->l_start + pages) > rrpc->nr_sects);
> +     BUG_ON((r->l_start + nr_pages) > rrpc->nr_sects);
>  
>       rrpc_unlock_laddr(rrpc, r);
>  }
> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
> index b94f2d5..eda9743 100644
> --- a/include/linux/lightnvm.h
> +++ b/include/linux/lightnvm.h
> @@ -231,6 +231,8 @@ struct nvm_rq {
>       void *metadata;
>       dma_addr_t dma_metadata;
>  
> +     void *priv;
> +
>       struct completion *wait;
>       nvm_end_io_fn *end_io;
>  
> @@ -243,12 +245,12 @@ struct nvm_rq {
>  
>  static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu)
>  {
> -     return pdu - sizeof(struct nvm_rq);
> +     return container_of(pdu, struct nvm_rq, priv);
>  }
>  
>  static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata)
>  {
> -     return rqdata + 1;
> +     return rqdata->priv;
>  }

I think the priv is unnecessary. This should be able to be expressed in
a single structure. E.g. have the rrpc_rq be a union of the data types
you need when processing either a read or a write. It seems that they no
longer have much in common.


Reply via email to