zram supports stream-based parallel compression. IOW, it can compress
in parallel on a multi-core system only if there are *several* streams
in flight, because each stream is compressed on its own CPU.

However, if there is only *a single* stream in the system, it cannot be
compressed in parallel even though the system has multiple CPUs.
This patch enables parallel compression using multiple CPUs even when
there is only a single stream in the system.

In my benchmarks, random read, which is important for the zram-swap
case, got worse with async reads, so only async-write is supported at
the moment. We might support async-read later.

This is useful for the single-stream scenario.
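
To summarize the dispatch logic, here is a simplified illustration only
(not the patch code itself; the real implementation below also handles
bio splitting, request batching, plugging and error fallback):

static blk_qc_t sketch_make_request(struct request_queue *q, struct bio *bio)
{
        struct zram *zram = q->queuedata;

        if (!zram->use_aio || !op_is_write(bio_op(bio))) {
                /* reads (and sync mode) are served in the caller's context */
                __zram_make_sync_request(zram, bio);
        } else {
                /*
                 * Each page of a write bio becomes a request on a shared
                 * list; one zramd kthread per CPU pulls requests off that
                 * list, so a single stream is compressed on several CPUs.
                 */
                if (__zram_make_async_request(zram, bio))
                        __zram_make_sync_request(zram, bio); /* fallback */
        }
        return BLK_QC_T_NONE;
}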

I tested on a 4-CPU ARM machine.

FIO job=1 benchmark result

Before:

seq-write: (groupid=0, jobs=1): err= 0: pid=2971: Tue Sep  6 08:14:06 2016
  write: io=163840KB, bw=28239KB/s, iops=441, runt=  5802msec
rand-write: (groupid=1, jobs=1): err= 0: pid=2977: Tue Sep  6 08:14:06 2016
  write: io=163840KB, bw=22300KB/s, iops=5575, runt=  7347msec
seq-read: (groupid=2, jobs=1): err= 0: pid=2983: Tue Sep  6 08:14:06 2016
  read : io=163840KB, bw=66928KB/s, iops=1045, runt=  2448msec
rand-read: (groupid=3, jobs=1): err= 0: pid=2984: Tue Sep  6 08:14:06 2016
  read : io=163840KB, bw=40980KB/s, iops=10245, runt=  3998msec
mixed-seq: (groupid=4, jobs=1): err= 0: pid=2985: Tue Sep  6 08:14:06 2016
  read : io=82240KB, bw=18308KB/s, iops=286, runt=  4492msec
  write: io=81600KB, bw=18166KB/s, iops=283, runt=  4492msec
mixed-rand: (groupid=5, jobs=1): err= 0: pid=2989: Tue Sep  6 08:14:06 2016
  read : io=84120KB, bw=14771KB/s, iops=3692, runt=  5695msec
  write: io=79720KB, bw=13998KB/s, iops=3499, runt=  5695msec

After:

  write: io=163840KB, bw=60547KB/s, iops=946, runt=  2706msec
rand-write: (groupid=1, jobs=1): err= 0: pid=2940: Tue Sep  6 08:13:04 2016
  write: io=163840KB, bw=39337KB/s, iops=9834, runt=  4165msec
seq-read: (groupid=2, jobs=1): err= 0: pid=2946: Tue Sep  6 08:13:04 2016
  read : io=163840KB, bw=66225KB/s, iops=1034, runt=  2474msec
rand-read: (groupid=3, jobs=1): err= 0: pid=2947: Tue Sep  6 08:13:04 2016
  read : io=163840KB, bw=40970KB/s, iops=10242, runt=  3999msec
mixed-seq: (groupid=4, jobs=1): err= 0: pid=2948: Tue Sep  6 08:13:04 2016
  read : io=82240KB, bw=31963KB/s, iops=499, runt=  2573msec
  write: io=81600KB, bw=31714KB/s, iops=495, runt=  2573msec
mixed-rand: (groupid=5, jobs=1): err= 0: pid=2952: Tue Sep  6 08:13:04 2016
  read : io=84120KB, bw=20192KB/s, iops=5048, runt=  4166msec
  write: io=79720KB, bw=19136KB/s, iops=4783, runt=  4166msec

So write and mixed-rw are about 2 times faster, while plain reads are unchanged.

I also ran fio with 4 jobs to check for regressions in full-stream
workloads; there is no regression, and writes are about two times faster.

FIO job=4 benchmark result
Before:
seq-write: (groupid=0, jobs=4): err= 0: pid=3060: Tue Sep  6 08:22:13 2016
  write: io=655360KB, bw=114834KB/s, iops=1794, runt=  5707msec
rand-write: (groupid=1, jobs=4): err= 0: pid=3071: Tue Sep  6 08:22:13 2016
  write: io=655360KB, bw=95520KB/s, iops=23879, runt=  6861msec
seq-read: (groupid=2, jobs=4): err= 0: pid=3083: Tue Sep  6 08:22:13 2016
  read : io=655360KB, bw=533247KB/s, iops=8331, runt=  1229msec
rand-read: (groupid=3, jobs=4): err= 0: pid=3087: Tue Sep  6 08:22:13 2016
  read : io=655360KB, bw=295874KB/s, iops=73968, runt=  2215msec
mixed-seq: (groupid=4, jobs=4): err= 0: pid=3091: Tue Sep  6 08:22:13 2016
  read : io=326272KB, bw=85861KB/s, iops=1341, runt=  3800msec
  write: io=329088KB, bw=86602KB/s, iops=1353, runt=  3800msec
mixed-rand: (groupid=5, jobs=4): err= 0: pid=3101: Tue Sep  6 08:22:13 2016
  read : io=326296KB, bw=49521KB/s, iops=12380, runt=  6589msec
  write: io=329064KB, bw=49941KB/s, iops=12485, runt=  6589msec

After:
seq-write: (groupid=0, jobs=4): err= 0: pid=3129: Tue Sep  6 08:23:02 2016
  write: io=655360KB, bw=246098KB/s, iops=3845, runt=  2663msec
rand-write: (groupid=1, jobs=4): err= 0: pid=3141: Tue Sep  6 08:23:02 2016
  write: io=655360KB, bw=179158KB/s, iops=44789, runt=  3658msec
seq-read: (groupid=2, jobs=4): err= 0: pid=3154: Tue Sep  6 08:23:02 2016
  read : io=655360KB, bw=560616KB/s, iops=8759, runt=  1169msec
rand-read: (groupid=3, jobs=4): err= 0: pid=3158: Tue Sep  6 08:23:02 2016
  read : io=655360KB, bw=290368KB/s, iops=72591, runt=  2257msec
mixed-seq: (groupid=4, jobs=4): err= 0: pid=3162: Tue Sep  6 08:23:02 2016
  read : io=326272KB, bw=196905KB/s, iops=3076, runt=  1657msec
  write: io=329088KB, bw=198605KB/s, iops=3103, runt=  1657msec
mixed-rand: (groupid=5, jobs=4): err= 0: pid=3172: Tue Sep  6 08:23:02 2016
  read : io=326296KB, bw=89152KB/s, iops=22287, runt=  3660msec
  write: io=329064KB, bw=89908KB/s, iops=22477, runt=  3660msec

Signed-off-by: Minchan Kim <minc...@kernel.org>
---

This is an RFC, so I intentionally didn't add any documentation for
use_aio; it would be wasted effort to write a document if we change our
minds after discussion (e.g., we might make aio the default IO model so
users never need to know about use_aio).

Another thing I still need to do is adjust the number of zram worker
threads as the number of online CPUs changes. That should be trivial.
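
As a rough, hypothetical sketch of one way to do that (not part of this
patch; zram_add_one_worker()/zram_stop_one_worker() are made-up helpers
that would reuse the kthread_run()/kthread_stop() logic already in
create_workers()/destroy_workers()):

#include <linux/cpu.h>
#include <linux/notifier.h>

/* hypothetical helpers: add/remove one entry on workers.worker_list */
static int zram_add_one_worker(void);
static void zram_stop_one_worker(void);

static int zram_cpu_callback(struct notifier_block *nb,
                             unsigned long action, void *hcpu)
{
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                /* one more CPU can compress, so start one more zramd */
                if (zram_add_one_worker())
                        return NOTIFY_BAD;
                break;
        case CPU_DEAD:
                /* keep the worker count matched to the online CPUs */
                zram_stop_one_worker();
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block zram_cpu_nb = {
        .notifier_call = zram_cpu_callback,
};

/* register_cpu_notifier(&zram_cpu_nb) would then be called from zram_init() */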

 drivers/block/zram/zram_drv.c | 558 +++++++++++++++++++++++++++++++++++++-----
 drivers/block/zram/zram_drv.h |   1 +
 2 files changed, 504 insertions(+), 55 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 04365b17ee67..feb6a4195c2f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/vmalloc.h>
+#include <linux/kthread.h>
 #include <linux/err.h>
 #include <linux/idr.h>
 #include <linux/sysfs.h>
@@ -366,6 +367,46 @@ static ssize_t comp_algorithm_store(struct device *dev,
        return len;
 }
 
+static ssize_t use_aio_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       bool val;
+       struct zram *zram = dev_to_zram(dev);
+
+       down_read(&zram->init_lock);
+       val = zram->use_aio;
+       up_read(&zram->init_lock);
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t use_aio_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t len)
+{
+       int ret;
+       u16 do_async;
+       struct zram *zram  = dev_to_zram(dev);
+
+       ret = kstrtou16(buf, 10, &do_async);
+       if (ret)
+               return ret;
+
+       down_write(&zram->init_lock);
+       if (init_done(zram)) {
+               up_write(&zram->init_lock);
+               pr_info("Can't change for initialized device\n");
+               return -EBUSY;
+       }
+
+       if (do_async)
+               zram->use_aio = true;
+       else
+               zram->use_aio = false;
+       up_write(&zram->init_lock);
+
+       return len;
+}
+
 static ssize_t compact_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
 {
@@ -872,7 +913,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
        return ret;
 }
 
-static void __zram_make_request(struct zram *zram, struct bio *bio)
+static void __zram_make_sync_request(struct zram *zram, struct bio *bio)
 {
        int offset;
        u32 index;
@@ -883,12 +924,6 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
        offset = (bio->bi_iter.bi_sector &
                  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
 
-       if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
-               zram_bio_discard(zram, index, offset, bio);
-               bio_endio(bio);
-               return;
-       }
-
        bio_for_each_segment(bvec, bio, iter) {
                int max_transfer_size = PAGE_SIZE - offset;
 
@@ -921,95 +956,502 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
        }
 
        bio_endio(bio);
+       zram_meta_put(zram);
        return;
 
 out:
        bio_io_error(bio);
+       zram_meta_put(zram);
 }
 
-/*
- * Handler function for all zram I/O requests.
- */
-static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
+static int zram_rw_sync_page(struct block_device *bdev, struct zram *zram,
+                               struct bio_vec *bv, u32 index,
+                               int offset, bool is_write)
 {
-       struct zram *zram = queue->queuedata;
+       int err;
 
-       if (unlikely(!zram_meta_get(zram)))
-               goto error;
+       err = zram_bvec_rw(zram, bv, index, offset, is_write);
+       /*
+        * If I/O fails, just return error(ie, non-zero) without
+        * calling page_endio.
+        * It causes resubmit the I/O with bio request by upper functions
+        * of rw_page(e.g., swap_readpage, __swap_writepage) and
+        * bio->bi_end_io does things to handle the error
+        * (e.g., SetPageError, set_page_dirty and extra works).
+        */
+       if (err == 0)
+               page_endio(bv->bv_page, is_write, 0);
 
-       blk_queue_split(queue, &bio, queue->bio_split);
+       zram_meta_put(zram);
+       return err;
+}
 
-       if (!valid_io_request(zram, bio->bi_iter.bi_sector,
-                                       bio->bi_iter.bi_size)) {
-               atomic64_inc(&zram->stats.invalid_io);
-               goto put_zram;
+const int NR_BATCH_PAGES = 64;
+
+struct zram_worker {
+       struct task_struct *task;
+       struct list_head list;
+};
+
+struct zram_workers {
+       spinlock_t req_lock;
+       struct list_head req_list;
+       unsigned int nr_req;
+       struct list_head worker_list;
+       wait_queue_head_t req_wait;
+       int nr_running;
+} workers;
+
+struct bio_request {
+       struct bio *bio;
+       atomic_t nr_pages;
+};
+
+struct page_request {
+       struct zram *zram;
+       struct bio_request *bio_req;
+       struct bio_vec bvec;
+       u32 index;
+       int offset;
+       bool write;
+       struct list_head list;
+};
+
+static void worker_wake_up(void)
+{
+       if (workers.nr_running * NR_BATCH_PAGES < workers.nr_req) {
+               int nr_wakeup = (workers.nr_req + NR_BATCH_PAGES) /
+                               NR_BATCH_PAGES - workers.nr_running;
+
+               WARN_ON(!nr_wakeup);
+               wake_up_nr(&workers.req_wait, nr_wakeup);
        }
+}
 
-       __zram_make_request(zram, bio);
-       zram_meta_put(zram);
-       return BLK_QC_T_NONE;
-put_zram:
-       zram_meta_put(zram);
-error:
-       bio_io_error(bio);
-       return BLK_QC_T_NONE;
+static void zram_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+       spin_lock_irq(&workers.req_lock);
+       if (workers.nr_req)
+               worker_wake_up();
+       spin_unlock_irq(&workers.req_lock);
+       kfree(cb);
 }
 
-static void zram_slot_free_notify(struct block_device *bdev,
-                               unsigned long index)
+static int zram_check_plugged(void)
+{
+       return !!blk_check_plugged(zram_unplug, NULL,
+                       sizeof(struct blk_plug_cb));
+}
+
+int queue_page_request(struct zram *zram, struct bio_vec *bvec, u32 index,
+                       int offset, bool write)
+{
+       struct page_request *page_req = kmalloc(sizeof(*page_req), GFP_NOIO);
+
+       if (!page_req)
+               return -ENOMEM;
+
+       page_req->bio_req = NULL;
+       page_req->zram = zram;
+       page_req->bvec = *bvec;
+       page_req->index = index;
+       page_req->offset = offset;
+       page_req->write = write;
+
+       spin_lock(&workers.req_lock);
+       list_add(&page_req->list, &workers.req_list);
+       workers.nr_req += 1;
+       if (!zram_check_plugged())
+               worker_wake_up();
+       spin_unlock(&workers.req_lock);
+
+
+       return 0;
+}
+
+int queue_page_request_list(struct zram *zram, struct bio_request *bio_req,
+                       struct bio_vec *bvec, u32 index, int offset,
+                       bool write, struct list_head *page_list)
+{
+       struct page_request *page_req = kmalloc(sizeof(*page_req), GFP_NOIO);
+
+       if (!page_req) {
+               while (!list_empty(page_list)) {
+                       page_req = list_first_entry(page_list,
+                                       struct page_request, list);
+                       list_del(&page_req->list);
+                       kfree(page_req);
+               }
+
+               return -ENOMEM;
+       }
+
+       page_req->bio_req = bio_req;
+       atomic_inc(&bio_req->nr_pages);
+       page_req->zram = zram;
+       page_req->bvec = *bvec;
+       page_req->index = index;
+       page_req->offset = offset;
+       page_req->write = write;
+
+       list_add_tail(&page_req->list, page_list);
+
+       return 0;
+}
+
+/* Caller should hold on req_lock */
+static void get_page_requests(struct list_head *page_list)
+{
+       struct page_request *page_req;
+       struct bio_request *bio_req;
+       int nr_batch = NR_BATCH_PAGES;
+
+       while (nr_batch--) {
+               if  (list_empty(&workers.req_list))
+                       break;
+
+               page_req = list_first_entry(&workers.req_list,
+                                       struct page_request, list);
+               list_move(&page_req->list, page_list);
+               bio_req = page_req->bio_req;
+               workers.nr_req--;
+       }
+}
+
+static int page_request_rw(struct page_request *page_req)
+{
+       struct zram *zram = page_req->zram;
+
+       return zram_bvec_rw(zram, &page_req->bvec, page_req->index,
+                       page_req->offset, page_req->write);
+}
+
+static void run_worker(struct bio *bio, struct list_head *page_list,
+                       unsigned int nr_pages)
 {
+       WARN_ON(list_empty(page_list));
+
+       spin_lock(&workers.req_lock);
+       list_splice_tail(page_list, &workers.req_list);
+       workers.nr_req += nr_pages;
+       if (bio->bi_opf & REQ_SYNC || !zram_check_plugged())
+               worker_wake_up();
+       spin_unlock(&workers.req_lock);
+}
+
+static int __zram_make_async_request(struct zram *zram, struct bio *bio)
+{
+       int offset;
+       u32 index;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
+       LIST_HEAD(page_list);
+       struct bio_request *bio_req;
+       unsigned int nr_pages = 0;
+       bool write = op_is_write(bio_op(bio));
+
+       index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
+       offset = (bio->bi_iter.bi_sector &
+                 (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+       bio_req = kmalloc(sizeof(*bio_req), GFP_NOIO);
+       if (!bio_req)
+               return 1;
+
+       /*
+        * Keep bi_vcnt to complete bio handling when all of pages
+        * in the bio are handled.
+        */
+       bio_req->bio = bio;
+       atomic_set(&bio_req->nr_pages, 0);
+
+       bio_for_each_segment(bvec, bio, iter) {
+               int max_transfer_size = PAGE_SIZE - offset;
+
+               if (bvec.bv_len > max_transfer_size) {
+                       /*
+                        * zram_bvec_rw() can only make operation on a single
+                        * zram page. Split the bio vector.
+                        */
+                       struct bio_vec bv;
+
+                       bv.bv_page = bvec.bv_page;
+                       bv.bv_len = max_transfer_size;
+                       bv.bv_offset = bvec.bv_offset;
+
+                       if (queue_page_request_list(zram, bio_req, &bv,
+                               index, offset, write, &page_list))
+                               goto out;
+                       nr_pages++;
+
+                       bv.bv_len = bvec.bv_len - max_transfer_size;
+                       bv.bv_offset += max_transfer_size;
+                       if (queue_page_request_list(zram, bio_req, &bv,
+                               index + 1, 0, write, &page_list))
+                               goto out;
+                       nr_pages++;
+               } else {
+                       if (queue_page_request_list(zram, bio_req, &bvec,
+                               index, offset, write, &page_list))
+                               goto out;
+                       nr_pages++;
+               }
+
+               update_position(&index, &offset, &bvec);
+       }
+
+       run_worker(bio, &page_list, nr_pages);
+       return 0;
+
+out:
+       kfree(bio_req);
+
+       WARN_ON(!list_empty(&page_list));
+       return 1;
+}
+
+
+void page_requests_rw(struct list_head *page_list)
+{
+       struct page_request *page_req;
+       bool write;
+       struct page *page;
        struct zram *zram;
-       struct zram_meta *meta;
 
-       zram = bdev->bd_disk->private_data;
-       meta = zram->meta;
+       while (!list_empty(page_list)) {
+               bool free_bio = false;
+               struct bio_request *bio_req;
+               int err;
+
+               page_req = list_last_entry(page_list, struct page_request,
+                                       list);
+               write = page_req->write;
+               page = page_req->bvec.bv_page;
+               zram = page_req->zram;
+               bio_req = page_req->bio_req;
+               if (bio_req && atomic_dec_and_test(&bio_req->nr_pages))
+                       free_bio = true;
+               list_del(&page_req->list);
+
+               err = page_request_rw(page_req);
+               kfree(page_req);
+               /* page-based request */
+               if (!bio_req) {
+                       page_endio(page, write, err);
+                       zram_meta_put(zram);
+               /* bio-based request */
+               } else if (free_bio) {
+                       if (likely(!err))
+                               bio_endio(bio_req->bio);
+                       else
+                               bio_io_error(bio_req->bio);
+                       kfree(bio_req);
+                       zram_meta_put(zram);
+               }
 
-       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
-       zram_free_page(zram, index);
-       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-       atomic64_inc(&zram->stats.notify_free);
+       }
+}
+
+static int zram_thread(void *data)
+{
+       DEFINE_WAIT(wait);
+       LIST_HEAD(page_list);
+
+       spin_lock(&workers.req_lock);
+       workers.nr_running++;
+
+       while (1) {
+               if (kthread_should_stop()) {
+                       workers.nr_running--;
+                       spin_unlock(&workers.req_lock);
+                       break;
+               }
+
+               if (list_empty(&workers.req_list)) {
+                       prepare_to_wait_exclusive(&workers.req_wait, &wait,
+                                       TASK_INTERRUPTIBLE);
+                       workers.nr_running--;
+                       spin_unlock(&workers.req_lock);
+                       schedule();
+                       spin_lock(&workers.req_lock);
+                       workers.nr_running++;
+                       finish_wait(&workers.req_wait, &wait);
+                       continue;
+               }
+
+               get_page_requests(&page_list);
+               if (list_empty(&page_list))
+                       continue;
+
+               spin_unlock(&workers.req_lock);
+               page_requests_rw(&page_list);
+               WARN_ON(!list_empty(&page_list));
+               cond_resched();
+               spin_lock(&workers.req_lock);
+       }
+
+       return 0;
+}
+
+static void destroy_workers(void)
+{
+       struct zram_worker *worker;
+
+       while (!list_empty(&workers.worker_list)) {
+               worker = list_first_entry(&workers.worker_list,
+                               struct zram_worker,
+                               list);
+               kthread_stop(worker->task);
+               list_del(&worker->list);
+               kfree(worker);
+       }
+
+       WARN_ON(workers.nr_running);
+}
+
+static int create_workers(void)
+{
+       int i;
+       int nr_cpu = num_online_cpus();
+       struct zram_worker *worker;
+
+       INIT_LIST_HEAD(&workers.worker_list);
+       INIT_LIST_HEAD(&workers.req_list);
+       spin_lock_init(&workers.req_lock);
+       init_waitqueue_head(&workers.req_wait);
+
+       for (i = 0; i < nr_cpu; i++) {
+               worker = kmalloc(sizeof(*worker), GFP_KERNEL);
+               if (!worker)
+                       goto error;
+
+               worker->task = kthread_run(zram_thread, NULL, "zramd-%d", i);
+               if (IS_ERR(worker->task)) {
+                       kfree(worker);
+                       goto error;
+               }
+
+               list_add(&worker->list, &workers.worker_list);
+       }
+
+       return 0;
+
+error:
+       destroy_workers();
+       return 1;
+}
+
+static int zram_rw_async_page(struct zram *zram,
+                       struct bio_vec *bv, u32 index, int offset,
+                       bool is_write)
+{
+
+       return queue_page_request(zram, bv, index, offset, is_write);
 }
 
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
                       struct page *page, bool is_write)
 {
-       int offset, err = -EIO;
-       u32 index;
+       int err = -EIO;
        struct zram *zram;
+       int offset;
+       u32 index;
        struct bio_vec bv;
 
        zram = bdev->bd_disk->private_data;
        if (unlikely(!zram_meta_get(zram)))
-               goto out;
+               return err;
 
        if (!valid_io_request(zram, sector, PAGE_SIZE)) {
                atomic64_inc(&zram->stats.invalid_io);
-               err = -EINVAL;
-               goto put_zram;
+               zram_meta_put(zram);
+               return -EINVAL;
        }
 
+
        index = sector >> SECTORS_PER_PAGE_SHIFT;
-       offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
+       offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
 
        bv.bv_page = page;
        bv.bv_len = PAGE_SIZE;
        bv.bv_offset = 0;
 
-       err = zram_bvec_rw(zram, &bv, index, offset, is_write);
-put_zram:
-       zram_meta_put(zram);
-out:
+       if (!zram->use_aio || !is_write) {
+               err = zram_rw_sync_page(bdev, zram, &bv, index, offset,
+                                       is_write);
+       } else {
+               err = zram_rw_async_page(zram, &bv, index, offset, is_write);
+               if (err)
+                       err = zram_rw_sync_page(bdev, zram, &bv, index,
+                                       offset, is_write);
+       }
+
+       return err;
+}
+
+
+static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
+{
+       struct zram *zram = queue->queuedata;
+
        /*
-        * If I/O fails, just return error(ie, non-zero) without
-        * calling page_endio.
-        * It causes resubmit the I/O with bio request by upper functions
-        * of rw_page(e.g., swap_readpage, __swap_writepage) and
-        * bio->bi_end_io does things to handle the error
-        * (e.g., SetPageError, set_page_dirty and extra works).
+        * request handler should take care of reference count and
+        * bio_endio.
         */
-       if (err == 0)
-               page_endio(page, is_write, 0);
-       return err;
+       if (unlikely(!zram_meta_get(zram)))
+               goto error;
+
+       blk_queue_split(queue, &bio, queue->bio_split);
+
+       if (!valid_io_request(zram, bio->bi_iter.bi_sector,
+                                       bio->bi_iter.bi_size)) {
+               atomic64_inc(&zram->stats.invalid_io);
+               goto fail;
+       }
+
+       if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+               int offset;
+               u32 index;
+
+               index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
+               offset = (bio->bi_iter.bi_sector &
+                               (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+               zram_bio_discard(zram, index, offset, bio);
+               bio_endio(bio);
+               zram_meta_put(zram);
+               goto out;
+       }
+
+       if (!zram->use_aio || !op_is_write(bio_op(bio))) {
+               __zram_make_sync_request(zram, bio);
+       } else {
+               if (__zram_make_async_request(zram, bio))
+                       __zram_make_sync_request(zram, bio);
+       }
+
+       return BLK_QC_T_NONE;
+
+fail:
+       zram_meta_put(zram);
+error:
+       bio_io_error(bio);
+out:
+       return BLK_QC_T_NONE;
+}
+
+static void zram_slot_free_notify(struct block_device *bdev,
+                               unsigned long index)
+{
+       struct zram *zram;
+       struct zram_meta *meta;
+
+       zram = bdev->bd_disk->private_data;
+       meta = zram->meta;
+
+       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+       zram_free_page(zram, index);
+       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+       atomic64_inc(&zram->stats.notify_free);
 }
 
 static void zram_reset_device(struct zram *zram)
@@ -1190,6 +1632,7 @@ static DEVICE_ATTR_RW(mem_limit);
 static DEVICE_ATTR_RW(mem_used_max);
 static DEVICE_ATTR_RW(max_comp_streams);
 static DEVICE_ATTR_RW(comp_algorithm);
+static DEVICE_ATTR_RW(use_aio);
 
 static struct attribute *zram_disk_attrs[] = {
        &dev_attr_disksize.attr,
@@ -1210,6 +1653,7 @@ static struct attribute *zram_disk_attrs[] = {
        &dev_attr_mem_used_max.attr,
        &dev_attr_max_comp_streams.attr,
        &dev_attr_comp_algorithm.attr,
+       &dev_attr_use_aio.attr,
        &dev_attr_io_stat.attr,
        &dev_attr_mm_stat.attr,
        &dev_attr_debug_stat.attr,
@@ -1464,6 +1908,9 @@ static int __init zram_init(void)
                num_devices--;
        }
 
+       if (create_workers())
+               goto out_error;
+
        return 0;
 
 out_error:
@@ -1474,6 +1921,7 @@ static int __init zram_init(void)
 static void __exit zram_exit(void)
 {
        destroy_devices();
+       destroy_workers();
 }
 
 module_init(zram_init);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 74fcf10da374..4819a33ab1cf 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -119,5 +119,6 @@ struct zram {
         * zram is claimed so open request will be failed
         */
        bool claim; /* Protected by bdev->bd_mutex */
+       bool use_aio; /* asynchronous IO mode */
 };
 #endif
-- 
2.7.4
