Block device emulation on top of ubi volumes with read/write support.
Block devices get automatically created for each ubi volume present.

Each ubiblock is fairly cheap since it's based on workqueues
and not on threads.

Read/write access is expected to work fairly well because the block
layer's elevator sorts the request queue so that transfers are grouped
by location. In other words, consecutive reads and writes are expected
to be ordered so that they tend to hit the same LEB.

To take advantage of this and reduce accesses to the UBI volume, a
write-back cache the size of one LEB has been implemented.
Every read and every write goes through this cache, and the cached LEB
is only written back when a request arrives to read or write a
different LEB, or when the device is released, i.e. when the last file
handle is closed.

The cache buffer is one LEB in size; it is vmalloc'ed at open() and freed
at release().

Cc: Artem Bityutskiy <dedeki...@gmail.com>
Signed-off-by: Ezequiel Garcia <elezegar...@gmail.com>
---
 drivers/mtd/ubi/Kconfig    |   12 +
 drivers/mtd/ubi/Makefile   |    1 +
 drivers/mtd/ubi/ubiblock.c |  673 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 686 insertions(+), 0 deletions(-)
 create mode 100644 drivers/mtd/ubi/ubiblock.c

diff --git a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig
index 36663af..aa6c592 100644
--- a/drivers/mtd/ubi/Kconfig
+++ b/drivers/mtd/ubi/Kconfig
@@ -87,4 +87,16 @@ config MTD_UBI_GLUEBI
           work on top of UBI. Do not enable this unless you use legacy
           software.
 
+config MTD_UBI_BLOCK
+       tristate "Caching block device access to UBI volumes"
+       help
+          Since UBI already takes care of eraseblock wear leveling
+          and bad block handling, it's possible to implement a block
+          device on top of it and therefore mount regular filesystems
+          (i.e. filesystems that are not flash-oriented, such as ext4).
+
+          In other words, this is a software flash translation layer.
+
+          If in doubt, say "N".
+
 endif # MTD_UBI
diff --git a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile
index b46b0c97..1578733 100644
--- a/drivers/mtd/ubi/Makefile
+++ b/drivers/mtd/ubi/Makefile
@@ -5,3 +5,4 @@ ubi-y += misc.o debug.o
 ubi-$(CONFIG_MTD_UBI_FASTMAP) += fastmap.o
 
 obj-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o
+obj-$(CONFIG_MTD_UBI_BLOCK) += ubiblock.o
diff --git a/drivers/mtd/ubi/ubiblock.c b/drivers/mtd/ubi/ubiblock.c
new file mode 100644
index 0000000..97655c1
--- /dev/null
+++ b/drivers/mtd/ubi/ubiblock.c
@@ -0,0 +1,673 @@
+/*
+ * Copyright (c) 2012 Ezequiel Garcia
+ * Copyright (c) 2011 Free Electrons
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ */
+
+/*#define DEBUG*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/mtd/ubi.h>
+#include <linux/workqueue.h>
+#include <linux/blkdev.h>
+
+#include "ubi-media.h"
+
+/*
+ * State of one ubiblock device, i.e. one block device sitting on top of
+ * one UBI volume.
+ */
+struct ubiblock {
+       /* Volume handle and info: set up on first open, NULL after last release */
+       struct ubi_volume_desc *desc;
+       struct ubi_volume_info *vi;
+       /* UBI device number and volume id this block device maps to */
+       int ubi_num;
+       int vol_id;
+       /* Number of current openers, changed under vol_mutex */
+       int refcnt;
+
+       struct gendisk *gd;
+       struct request_queue *rq;
+
+       /* Per-device workqueue; the work item runs ubiblock_do_work() */
+       struct workqueue_struct *wq;
+       struct work_struct work;
+
+       /* Held by open/release/ioctl/resize and while a request is served */
+       struct mutex vol_mutex;
+       /* Lock handed to blk_init_queue() for the request queue */
+       spinlock_t queue_lock;
+       /* Entry in the ubiblock_devices list */
+       struct list_head list;
+
+       /* One-LEB write-back cache shared by reads and writes */
+       enum { STATE_EMPTY, STATE_CLEAN, STATE_DIRTY } cache_state;
+       /* Buffer of leb_size bytes, vmalloc'ed on first open */
+       void *cache;
+       /* LEB number held in the cache; reset to -1 at open and release */
+       int cache_leb_num;
+       /* Usable LEB size of the volume, read from the volume info at open */
+       int leb_size;
+
+#ifdef DEBUG
+       /*
+        * TODO: Output this information through a debugfs file.
+        * We can re-use ubi debugfs directories.
+        */
+       unsigned cache_read_hit, cache_read_miss;
+       unsigned cache_write_hit, cache_write_miss;
+#endif
+};
+
+/* Linked list of all ubiblock instances */
+static LIST_HEAD(ubiblock_devices);
+/* Taken by the add/del/resize paths around accesses to ubiblock_devices */
+static DEFINE_MUTEX(devices_mutex);
+/* Block major number obtained from register_blkdev() at module init */
+static int ubiblock_major;
+
+/*
+ * Search the device list for the ubiblock bound to UBI device @ubi_num,
+ * volume @vol_id. Returns the matching entry, or NULL when there is none.
+ * No locking is done here.
+ */
+static struct ubiblock *find_dev_nolock(int ubi_num, int vol_id)
+{
+       struct ubiblock *cur;
+
+       list_for_each_entry(cur, &ubiblock_devices, list) {
+               if (cur->ubi_num != ubi_num)
+                       continue;
+               if (cur->vol_id == vol_id)
+                       return cur;
+       }
+
+       return NULL;
+}
+
+/*
+ * Tell whether @leb_num is the LEB the cache was last filled with.
+ * Only the LEB number is compared; cache_state is not consulted.
+ */
+static bool leb_on_cache(struct ubiblock *dev, int leb_num)
+{
+       return dev->cache_leb_num == leb_num;
+}
+
+/*
+ * Load LEB @leb_num of the opened volume into the one-LEB cache.
+ *
+ * The cache is expected to have been flushed beforehand; filling it while
+ * it is dirty triggers a warning. The cache is only marked as holding
+ * @leb_num once the read has succeeded.
+ *
+ * Returns 0 on success or the error code from ubi_read().
+ */
+static int ubiblock_fill_cache(struct ubiblock *dev, int leb_num)
+{
+       int ret;
+
+       /* Warn if we fill cache while being dirty */
+       WARN_ON(dev->cache_state == STATE_DIRTY);
+
+       ret = ubi_read(dev->desc, leb_num, dev->cache, 0, dev->leb_size);
+       if (ret) {
+               /*
+                * The buffer contents are undefined after a failed read.
+                * Invalidate the cache so that later requests read the LEB
+                * again instead of being served from this buffer.
+                */
+               dev->cache_leb_num = -1;
+               dev->cache_state = STATE_EMPTY;
+               dev_err(disk_to_dev(dev->gd), "ubi_read error %d\n", ret);
+               return ret;
+       }
+
+       dev->cache_leb_num = leb_num;
+       dev->cache_state = STATE_CLEAN;
+       return 0;
+}
+
+/*
+ * Write the cached LEB back to the volume if it has been modified.
+ *
+ * @sync: when true, also call ubi_sync() on the UBI device after the
+ *        write-back succeeded.
+ *
+ * Returns 0 when the cache was not dirty or on success, otherwise the
+ * error code from ubi_leb_change() or ubi_sync(). On a write-back error
+ * the cache stays dirty, so the data is not dropped and a later flush
+ * tries again.
+ */
+static int ubiblock_flush(struct ubiblock *dev, bool sync)
+{
+       int ret;
+
+       if (dev->cache_state != STATE_DIRTY)
+               return 0;
+
+       /* Atomically change leb with buffer contents */
+       ret = ubi_leb_change(dev->desc, dev->cache_leb_num,
+                            dev->cache, dev->leb_size);
+       if (ret) {
+               dev_err(disk_to_dev(dev->gd), "ubi_leb_change error %d\n", ret);
+               return ret;
+       }
+
+       /*
+        * TODO: mtdblock sets STATE_EMPTY, arguing that it prevents the
+        * underlying media to get changed without notice.
+        * I'm not fully convinced, so I just put STATE_CLEAN.
+        *
+        * Only done once the data has reached the volume.
+        */
+       dev->cache_state = STATE_CLEAN;
+
+       /* Sync ubi device when device is released and on block flush ioctl */
+       if (sync)
+               ret = ubi_sync(dev->ubi_num);
+
+       return ret;
+}
+
+/*
+ * Copy @len bytes, starting at byte position @pos of the volume, into
+ * @buffer.
+ *
+ * Data is always served from the one-LEB cache, which is shared with the
+ * write path. The transfer is split so that each pass stays within a
+ * single LEB; when that LEB is not the cached one, the cache is flushed
+ * and refilled first.
+ *
+ * Possible alternatives, should this prove suboptimal:
+ * 1. Separate read and write caches, though this looks overly complicated.
+ * 2. Read from the cache only on a hit, and straight from the volume
+ *    otherwise.
+ *
+ * Returns 0 on success or the first error from ubiblock_flush() or
+ * ubiblock_fill_cache().
+ */
+static int ubiblock_read(struct ubiblock *dev, char *buffer,
+                        int pos, int len)
+{
+       int leb = pos / dev->leb_size;
+       int offset = pos % dev->leb_size;
+       int remaining = len;
+       int chunk, err;
+       bool hit;
+
+       while (remaining) {
+               /* Never cross a LEB boundary within one pass */
+               chunk = min(remaining, dev->leb_size - offset);
+
+               hit = leb_on_cache(dev, leb);
+               if (!hit) {
+                       err = ubiblock_flush(dev, false);
+                       if (!err)
+                               err = ubiblock_fill_cache(dev, leb);
+                       if (err)
+                               return err;
+               }
+               memcpy(buffer, dev->cache + offset, chunk);
+
+#ifdef DEBUG
+               if (hit)
+                       dev->cache_read_hit++;
+               else
+                       dev->cache_read_miss++;
+#endif
+               /* Continue at the start of the next LEB */
+               remaining -= chunk;
+               buffer += chunk;
+               offset = 0;
+               leb++;
+       }
+
+       return 0;
+}
+
+/*
+ * Copy @len bytes from @buffer into the volume, starting at byte
+ * position @pos.
+ *
+ * The data only lands in the one-LEB cache, which is marked dirty; it is
+ * written to the volume by a later ubiblock_flush(). The transfer is
+ * split so that each pass stays within a single LEB; when that LEB is
+ * not the cached one, the cache is flushed and refilled first.
+ *
+ * Returns 0 on success or the first error from ubiblock_flush() or
+ * ubiblock_fill_cache().
+ */
+static int ubiblock_write(struct ubiblock *dev, const char *buffer,
+                        int pos, int len)
+{
+       int leb = pos / dev->leb_size;
+       int offset = pos % dev->leb_size;
+       int remaining = len;
+       int chunk, err;
+       bool hit;
+
+       while (remaining) {
+               /* Never cross a LEB boundary within one pass */
+               chunk = min(remaining, dev->leb_size - offset);
+
+               hit = leb_on_cache(dev, leb);
+               if (!hit) {
+                       err = ubiblock_flush(dev, false);
+                       if (!err)
+                               err = ubiblock_fill_cache(dev, leb);
+                       if (err)
+                               return err;
+               }
+
+               /* The cache is dirtied here and nowhere else */
+               memcpy(dev->cache + offset, buffer, chunk);
+               dev->cache_state = STATE_DIRTY;
+
+#ifdef DEBUG
+               if (hit)
+                       dev->cache_write_hit++;
+               else
+                       dev->cache_write_miss++;
+#endif
+               /* Continue at the start of the next LEB */
+               remaining -= chunk;
+               buffer += chunk;
+               offset = 0;
+               leb++;
+       }
+
+       return 0;
+}
+
+/*
+ * Serve the current segment of @req through the cache.
+ *
+ * Only filesystem requests that lie within the disk capacity are
+ * accepted. Returns 0 on success, -EIO for a request that cannot be
+ * served, or the error from ubiblock_read()/ubiblock_write().
+ */
+static int do_ubiblock_request(struct ubiblock *dev, struct request *req)
+{
+       int pos, len;
+
+       if (req->cmd_type != REQ_TYPE_FS)
+               return -EIO;
+
+       if (blk_rq_pos(req) + blk_rq_cur_sectors(req) >
+           get_capacity(req->rq_disk))
+               return -EIO;
+
+       /*
+        * ubiblock_read() and ubiblock_write() take the byte position as
+        * an int. Refuse a request whose position does not fit, rather
+        * than truncating it and accessing the wrong LEB.
+        */
+       if (blk_rq_pos(req) > (sector_t)(INT_MAX >> 9))
+               return -EIO;
+
+       /* Sectors are 512 bytes */
+       pos = blk_rq_pos(req) << 9;
+       len = blk_rq_cur_bytes(req);
+
+       switch (rq_data_dir(req)) {
+       case READ:
+               return ubiblock_read(dev, req->buffer, pos, len);
+       case WRITE:
+               return ubiblock_write(dev, req->buffer, pos, len);
+       default:
+               return -EIO;
+       }
+}
+
+/*
+ * Work function of the per-device workqueue: fetch requests from the
+ * device's request queue and serve them one segment at a time until the
+ * queue is empty.
+ *
+ * The queue lock is held while fetching and completing requests and is
+ * dropped around the actual transfer, which runs under vol_mutex.
+ */
+static void ubiblock_do_work(struct work_struct *work)
+{
+       struct ubiblock *dev =
+               container_of(work, struct ubiblock, work);
+       struct request_queue *rq = dev->rq;
+       struct request *req;
+       int res;
+
+       spin_lock_irq(rq->queue_lock);
+
+       req = blk_fetch_request(rq);
+       while (req) {
+
+               /* The transfer takes a mutex, so release the spinlock first */
+               spin_unlock_irq(rq->queue_lock);
+
+               mutex_lock(&dev->vol_mutex);
+               res = do_ubiblock_request(dev, req);
+               mutex_unlock(&dev->vol_mutex);
+
+               spin_lock_irq(rq->queue_lock);
+
+               /*
+                * If we're done with this request,
+                * we need to fetch a new one
+                */
+               if (!__blk_end_request_cur(req, res))
+                       req = blk_fetch_request(rq);
+       }
+
+       spin_unlock_irq(rq->queue_lock);
+}
+
+/*
+ * Request function registered with blk_init_queue().
+ *
+ * Requests are not served here: the device's work item is queued and
+ * ubiblock_do_work() does the transfers. If the queue has no device
+ * attached, every pending request is completed with -ENODEV.
+ */
+static void ubiblock_request(struct request_queue *rq)
+{
+       struct ubiblock *dev = rq->queuedata;
+       struct request *req;
+
+       if (dev) {
+               queue_work(dev->wq, &dev->work);
+               return;
+       }
+
+       for (req = blk_fetch_request(rq); req; req = blk_fetch_request(rq))
+               __blk_end_request_all(req, -ENODEV);
+}
+
+/*
+ * Block device open handler.
+ *
+ * The first opener opens the UBI volume, fetches its volume info and
+ * allocates the one-LEB cache; later openers only bump the reference
+ * count. On failure everything acquired here is released and the
+ * matching pointers are reset, so the device is left as it was before.
+ *
+ * NOTE(review): the UBI open mode is chosen from the first opener's
+ * @mode only; a later opener asking for write access re-uses a handle
+ * that may have been opened UBI_READONLY -- confirm this is intended.
+ *
+ * Returns 0 on success, the error from ubi_open_volume(), or -ENOMEM.
+ */
+static int ubiblock_open(struct block_device *bdev, fmode_t mode)
+{
+       struct ubiblock *dev = bdev->bd_disk->private_data;
+       int ubi_mode = UBI_READONLY;
+       int ret;
+
+       mutex_lock(&dev->vol_mutex);
+       if (dev->refcnt > 0) {
+               /*
+                * The volume is already opened,
+                * just increase the reference counter
+                */
+               dev->refcnt++;
+               mutex_unlock(&dev->vol_mutex);
+               return 0;
+       }
+
+       if (mode & FMODE_WRITE)
+               ubi_mode = UBI_READWRITE;
+
+       dev->desc = ubi_open_volume(dev->ubi_num, dev->vol_id, ubi_mode);
+       if (IS_ERR(dev->desc)) {
+               dev_err(disk_to_dev(dev->gd),
+                       "failed to open ubi volume %d_%d\n",
+                       dev->ubi_num, dev->vol_id);
+
+               ret = PTR_ERR(dev->desc);
+               dev->desc = NULL;
+               goto out_unlock;
+       }
+
+       dev->vi = kzalloc(sizeof(*dev->vi), GFP_KERNEL);
+       if (!dev->vi) {
+               ret = -ENOMEM;
+               goto out_close;
+       }
+       ubi_get_volume_info(dev->desc, dev->vi);
+
+       /* Allocate cache buffer, mtdblock uses vmalloc and we do too */
+       dev->leb_size = dev->vi->usable_leb_size;
+       dev->cache_leb_num = -1;
+       dev->cache_state = STATE_EMPTY;
+       dev->cache = vmalloc(dev->leb_size);
+       if (!dev->cache) {
+               ret = -ENOMEM;
+               goto out_free;
+       }
+
+       dev->refcnt++;
+       mutex_unlock(&dev->vol_mutex);
+       return 0;
+
+out_free:
+       kfree(dev->vi);
+       /* Don't leave a dangling pointer behind in the long-lived device */
+       dev->vi = NULL;
+out_close:
+       ubi_close_volume(dev->desc);
+       dev->desc = NULL;
+out_unlock:
+       mutex_unlock(&dev->vol_mutex);
+       return ret;
+}
+
+/*
+ * Block device release handler.
+ *
+ * Drops one reference. When the last opener goes away the cache is
+ * flushed and synced, the cache buffer and volume info are freed and the
+ * UBI volume is closed. A failing final flush is reported, since the
+ * cached data is discarded together with the buffer.
+ *
+ * Always returns 0.
+ */
+static int ubiblock_release(struct gendisk *gd, fmode_t mode)
+{
+       struct ubiblock *dev = gd->private_data;
+       int ret;
+
+       mutex_lock(&dev->vol_mutex);
+
+       dev->refcnt--;
+       if (dev->refcnt == 0) {
+               ret = ubiblock_flush(dev, true);
+               if (ret)
+                       dev_err(disk_to_dev(gd),
+                               "flush on release failed (%d), cached data may be lost\n",
+                               ret);
+
+               vfree(dev->cache);
+               dev->cache = NULL;
+               dev->cache_leb_num = -1;
+               dev->cache_state = STATE_EMPTY;
+
+               kfree(dev->vi);
+               ubi_close_volume(dev->desc);
+
+               dev->vi = NULL;
+               dev->desc = NULL;
+       }
+
+       mutex_unlock(&dev->vol_mutex);
+       return 0;
+}
+
+/*
+ * Block device ioctl handler.
+ *
+ * BLKFLSBUF flushes and syncs the cache; every other command yields
+ * -ENOTTY. Returns -ENXIO when the disk carries no ubiblock device.
+ *
+ * TODO: the author never saw this handler being called; find out why.
+ */
+static int ubiblock_ioctl(struct block_device *bdev, fmode_t mode,
+                             unsigned int cmd, unsigned long arg)
+{
+       struct ubiblock *dev = bdev->bd_disk->private_data;
+       int ret;
+
+       if (!dev)
+               return -ENXIO;
+
+       mutex_lock(&dev->vol_mutex);
+       if (cmd == BLKFLSBUF)
+               ret = ubiblock_flush(dev, true);
+       else
+               ret = -ENOTTY;
+       mutex_unlock(&dev->vol_mutex);
+
+       return ret;
+}
+
+/* Block device operations installed on every ubiblock gendisk */
+static const struct block_device_operations ubiblock_ops = {
+       .owner = THIS_MODULE,
+       .open = ubiblock_open,
+       .release = ubiblock_release,
+       .ioctl = ubiblock_ioctl,
+};
+
+/*
+ * Create and register the block device for the UBI volume described
+ * by @vi.
+ *
+ * Allocates the device state, gendisk, request queue and workqueue, puts
+ * the device on the global list and finally makes the disk visible.
+ *
+ * Returns 0 on success, -EEXIST if the volume already has a block
+ * device, -ENOMEM or -ENODEV when a resource cannot be set up.
+ */
+static int ubiblock_add(struct ubi_volume_info *vi)
+{
+       struct ubiblock *dev;
+       struct gendisk *gd;
+       sector_t disk_capacity;
+       int ret;
+
+       /* Check that the volume isn't already handled */
+       mutex_lock(&devices_mutex);
+       if (find_dev_nolock(vi->ubi_num, vi->vol_id)) {
+               mutex_unlock(&devices_mutex);
+               return -EEXIST;
+       }
+       mutex_unlock(&devices_mutex);
+
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
+
+       mutex_init(&dev->vol_mutex);
+
+       dev->ubi_num = vi->ubi_num;
+       dev->vol_id = vi->vol_id;
+
+       /* Initialize the gendisk of this ubiblock device */
+       gd = alloc_disk(1);
+       if (!gd) {
+               pr_err("alloc_disk failed\n");
+               ret = -ENODEV;
+               goto out_free_dev;
+       }
+
+       gd->fops = &ubiblock_ops;
+       gd->major = ubiblock_major;
+       gd->first_minor = dev->ubi_num * UBI_MAX_VOLUMES + dev->vol_id;
+       gd->private_data = dev;
+       snprintf(gd->disk_name, sizeof(gd->disk_name), "ubiblock%d_%d",
+                dev->ubi_num, dev->vol_id);
+       /*
+        * Capacity in 512-byte sectors. Compute the product in 64 bits:
+        * size * usable_leb_size overflows an int for volumes of 2 GiB
+        * and more.
+        */
+       disk_capacity = ((u64)vi->size * vi->usable_leb_size) >> 9;
+       set_capacity(gd, disk_capacity);
+       dev->gd = gd;
+
+       spin_lock_init(&dev->queue_lock);
+       dev->rq = blk_init_queue(ubiblock_request, &dev->queue_lock);
+       if (!dev->rq) {
+               pr_err("blk_init_queue failed\n");
+               ret = -ENODEV;
+               goto out_put_disk;
+       }
+
+       dev->rq->queuedata = dev;
+       dev->gd->queue = dev->rq;
+
+       /* TODO: Is performance better or worse with this flag? */
+       /* queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->rq);*/
+
+       /*
+        * Create one workqueue per volume (per registered block device).
+        * Remember workqueues are cheap, they're not threads.
+        * The name goes through "%s" so that it is never interpreted as
+        * a format string.
+        */
+       dev->wq = alloc_workqueue("%s", 0, 0, gd->disk_name);
+       if (!dev->wq) {
+               pr_err("alloc_workqueue failed\n");
+               ret = -ENOMEM;
+               goto out_free_queue;
+       }
+       INIT_WORK(&dev->work, ubiblock_do_work);
+
+       mutex_lock(&devices_mutex);
+       list_add_tail(&dev->list, &ubiblock_devices);
+       mutex_unlock(&devices_mutex);
+
+       /* Must be the last step: anyone can call file ops from now on */
+       add_disk(dev->gd);
+
+       dev_info(disk_to_dev(dev->gd), "created from ubi%d:%d(%s)\n",
+                dev->ubi_num, dev->vol_id, vi->name);
+
+       return 0;
+
+out_free_queue:
+       blk_cleanup_queue(dev->rq);
+out_put_disk:
+       put_disk(dev->gd);
+out_free_dev:
+       kfree(dev);
+
+       return ret;
+}
+
+/*
+ * Tear down the block layer side of @dev: remove the disk, clean up the
+ * request queue and drop the gendisk reference. The workqueue and @dev
+ * itself are not touched here; the callers release those.
+ */
+static void ubiblock_cleanup(struct ubiblock *dev)
+{
+#ifdef DEBUG
+       /* The counters are unsigned, hence %u */
+       pr_debug("%s: read hit/miss %u/%u, write hit/miss %u/%u\n",
+               dev->gd->disk_name,
+               dev->cache_read_hit, dev->cache_read_miss,
+               dev->cache_write_hit, dev->cache_write_miss);
+#endif
+       del_gendisk(dev->gd);
+       blk_cleanup_queue(dev->rq);
+       put_disk(dev->gd);
+}
+
+/*
+ * Remove the block device that belongs to the UBI volume described
+ * by @vi.
+ *
+ * The device is unlinked from the global list, its workqueue is
+ * destroyed, the disk and queue are torn down and the device is freed.
+ *
+ * Returns 0 on success or -ENODEV if the volume has no block device.
+ */
+static int ubiblock_del(struct ubi_volume_info *vi)
+{
+       struct ubiblock *dev;
+
+       /* Look the device up and unlink it in one critical section */
+       mutex_lock(&devices_mutex);
+       dev = find_dev_nolock(vi->ubi_num, vi->vol_id);
+       if (dev)
+               list_del(&dev->list);
+       mutex_unlock(&devices_mutex);
+
+       if (!dev) {
+               pr_warn("trying to remove %s, but it isn't handled\n",
+                       vi->name);
+               return -ENODEV;
+       }
+
+       /* Flush pending work and stop this workqueue */
+       destroy_workqueue(dev->wq);
+
+       mutex_lock(&dev->vol_mutex);
+       /*
+        * A non-NULL desc means the ubiblock device is still opened.
+        * That is not expected: open() calls ubi_open_volume(), which
+        * should prevent the volume from being removed.
+        */
+       WARN_ON(dev->desc);
+       ubiblock_cleanup(dev);
+       mutex_unlock(&dev->vol_mutex);
+
+       kfree(dev);
+
+       return 0;
+}
+
+/*
+ * Update the capacity of the block device that belongs to the UBI volume
+ * described by @vi, after the volume has been resized.
+ *
+ * Returns 0 on success or -ENODEV if the volume has no block device.
+ */
+static int ubiblock_resize(struct ubi_volume_info *vi)
+{
+       struct ubiblock *dev;
+       sector_t disk_capacity;
+
+       /*
+        * We don't touch the list, but we better lock it: it could be that the
+        * device gets removed between the time the device has been found and
+        * the time we access dev->gd. For that reason devices_mutex is
+        * held until we are done with the device, not just for the lookup.
+        */
+       mutex_lock(&devices_mutex);
+       dev = find_dev_nolock(vi->ubi_num, vi->vol_id);
+       if (!dev) {
+               mutex_unlock(&devices_mutex);
+               pr_warn("trying to resize %s, which isn't handled\n",
+                       vi->name);
+               return -ENODEV;
+       }
+
+       mutex_lock(&dev->vol_mutex);
+       /*
+        * Capacity in 512-byte sectors. Compute the product in 64 bits:
+        * size * usable_leb_size overflows an int for volumes of 2 GiB
+        * and more.
+        */
+       disk_capacity = ((u64)vi->size * vi->usable_leb_size) >> 9;
+       set_capacity(dev->gd, disk_capacity);
+       dev_dbg(disk_to_dev(dev->gd), "resized to %d LEBs\n", vi->size);
+       mutex_unlock(&dev->vol_mutex);
+
+       mutex_unlock(&devices_mutex);
+
+       return 0;
+}
+
+/*
+ * UBI volume notifier callback: create, remove or resize the block
+ * device that mirrors the volume the notification is about. Other
+ * notification types are ignored, and so are the handlers' return
+ * values. Always returns NOTIFY_OK.
+ */
+static int ubiblock_notify(struct notifier_block *nb,
+                        unsigned long notification_type, void *ns_ptr)
+{
+       struct ubi_notification *nt = ns_ptr;
+       struct ubi_volume_info *vi = &nt->vi;
+
+       if (notification_type == UBI_VOLUME_ADDED)
+               ubiblock_add(vi);
+       else if (notification_type == UBI_VOLUME_REMOVED)
+               ubiblock_del(vi);
+       else if (notification_type == UBI_VOLUME_RESIZED)
+               ubiblock_resize(vi);
+
+       return NOTIFY_OK;
+}
+
+/* Notifier block registered with UBI at module init, removed at exit */
+static struct notifier_block ubiblock_notifier = {
+       .notifier_call = ubiblock_notify,
+};
+
+/*
+ * Module init: obtain a dynamic block major and register the UBI volume
+ * notifier that creates the block devices.
+ *
+ * Returns 0 on success or a negative error code. The major is given
+ * back if the notifier cannot be registered.
+ */
+static int __init ubiblock_init(void)
+{
+       int ret;
+
+       ret = register_blkdev(0, "ubiblock");
+       if (ret < 0)
+               return ret;
+       /* Must be set before the notifier can create any device */
+       ubiblock_major = ret;
+
+       /*
+        * Blocks will get registered dynamically.
+        * Each ubi volume will get a corresponding block device.
+        */
+       ret = ubi_register_volume_notifier(&ubiblock_notifier, 0);
+       if (ret)
+               unregister_blkdev(ubiblock_major, "ubiblock");
+
+       return ret;
+}
+
+/*
+ * Module exit: stop listening to UBI volume notifications, destroy every
+ * remaining ubiblock device and give the block major back.
+ */
+static void __exit ubiblock_exit(void)
+{
+       struct ubiblock *dev, *tmp;
+
+       ubi_unregister_volume_notifier(&ubiblock_notifier);
+
+       list_for_each_entry_safe(dev, tmp, &ubiblock_devices, list) {
+               /* Unlink the device from the global list */
+               list_del(&dev->list);
+
+               /* Flush pending work and stop workqueue */
+               destroy_workqueue(dev->wq);
+
+               /* Still opened: the module is being forcefully removed */
+               WARN_ON(dev->desc);
+
+               ubiblock_cleanup(dev);
+               kfree(dev);
+       }
+
+       unregister_blkdev(ubiblock_major, "ubiblock");
+}
+
+/* Module entry and exit points */
+module_init(ubiblock_init);
+module_exit(ubiblock_exit);
+
+/* Module metadata */
+MODULE_DESCRIPTION("Block device emulation access to UBI volumes");
+MODULE_AUTHOR("David Wagner");
+MODULE_AUTHOR("Ezequiel Garcia <elezegar...@gmail.com>");
+MODULE_LICENSE("GPL");
-- 
1.7.8.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to