I have changed cvs commit message for > dm.h dm_target_stripe.c
to Add multi device strip support written by Guillermo Amaral and reviewed by me. On Oct,Saturday 23 2010, at 11:18 PM, Adam Hamsik wrote: > Module Name: src > Committed By: haad > Date: Sat Oct 23 21:18:55 UTC 2010 > > Modified Files: > src/sys/dev/dm: device-mapper.c dm.h dm_target_stripe.c > Added Files: > src/sys/dev/dm/doc: locking.txt > > Log Message: > Add old file describing locking schema used in dm driver. > > > To generate a diff of this commit: > cvs rdiff -u -r1.24 -r1.25 src/sys/dev/dm/device-mapper.c > cvs rdiff -u -r1.18 -r1.19 src/sys/dev/dm/dm.h > cvs rdiff -u -r1.10 -r1.11 src/sys/dev/dm/dm_target_stripe.c > cvs rdiff -u -r0 -r1.1 src/sys/dev/dm/doc/locking.txt > > Please note that diffs are not public domain; they are subject to the > copyright notices on the relevant files. > > Modified files: > > Index: src/sys/dev/dm/device-mapper.c > diff -u src/sys/dev/dm/device-mapper.c:1.24 > src/sys/dev/dm/device-mapper.c:1.25 > --- src/sys/dev/dm/device-mapper.c:1.24 Sat Oct 9 12:56:06 2010 > +++ src/sys/dev/dm/device-mapper.c Sat Oct 23 21:18:54 2010 > @@ -1,4 +1,4 @@ > -/* $NetBSD: device-mapper.c,v 1.24 2010/10/09 12:56:06 haad Exp $ */ > +/* $NetBSD: device-mapper.c,v 1.25 2010/10/23 21:18:54 haad Exp $ */ > > /* > * Copyright (c) 2010 The NetBSD Foundation, Inc. > @@ -350,7 +350,6 @@ > r = 0; > > aprint_debug("dmioctl called\n"); > - > KASSERT(data != NULL); > > if (( r = disk_ioctl_switch(dev, cmd, data)) == ENOTTY) { > > Index: src/sys/dev/dm/dm.h > diff -u src/sys/dev/dm/dm.h:1.18 src/sys/dev/dm/dm.h:1.19 > --- src/sys/dev/dm/dm.h:1.18 Tue May 18 15:10:41 2010 > +++ src/sys/dev/dm/dm.h Sat Oct 23 21:18:54 2010 > @@ -1,4 +1,4 @@ > -/* $NetBSD: dm.h,v 1.18 2010/05/18 15:10:41 haad Exp $ */ > +/* $NetBSD: dm.h,v 1.19 2010/10/23 21:18:54 haad Exp $ */ > > /* > * Copyright (c) 2008 The NetBSD Foundation, Inc. > @@ -170,12 +170,23 @@ > typedef struct target_linear_config { > dm_pdev_t *pdev; > uint64_t offset; > + TAILQ_ENTRY(target_linear_config) entries; > } dm_target_linear_config_t; > > +/* > + * Striping devices are stored in a linked list, this might be inefficient > + * for more than 8 striping devices and can be changed to something more > + * scalable. > + * TODO: look for other options than linked list. > + */ > +TAILQ_HEAD(target_linear_devs, target_linear_config); > + > +typedef struct target_linear_devs dm_target_linear_devs_t; > + > /* for stripe : */ > typedef struct target_stripe_config { > -#define MAX_STRIPES 2 > - struct target_linear_config stripe_devs[MAX_STRIPES]; > +#define DM_STRIPE_DEV_OFFSET 2 > + struct target_linear_devs stripe_devs; > uint8_t stripe_num; > uint64_t stripe_chunksize; > size_t params_len; > > Index: src/sys/dev/dm/dm_target_stripe.c > diff -u src/sys/dev/dm/dm_target_stripe.c:1.10 > src/sys/dev/dm/dm_target_stripe.c:1.11 > --- src/sys/dev/dm/dm_target_stripe.c:1.10 Tue May 18 15:10:41 2010 > +++ src/sys/dev/dm/dm_target_stripe.c Sat Oct 23 21:18:54 2010 > @@ -1,4 +1,4 @@ > -/*$NetBSD: dm_target_stripe.c,v 1.10 2010/05/18 15:10:41 haad Exp $*/ > +/*$NetBSD: dm_target_stripe.c,v 1.11 2010/10/23 21:18:54 haad Exp $*/ > > /* > * Copyright (c) 2009 The NetBSD Foundation, Inc. > @@ -102,6 +102,8 @@ > > /* > * Init function called from dm_table_load_ioctl. > + * DM_STRIPE_DEV_OFFSET should always hold the index of the first > device-offset > + * pair in the parameters. > * Example line sent to dm from lvm tools when using striped target. > * start length striped #stripes chunk_size device1 offset1 ... deviceN > offsetN > * 0 65536 striped 2 512 /dev/hda 0 /dev/hdb 0 > @@ -109,9 +111,11 @@ > int > dm_target_stripe_init(dm_dev_t * dmv, void **target_config, char *params) > { > + dm_target_linear_config_t *tlc; > dm_target_stripe_config_t *tsc; > size_t len; > char **ap, *argv[10]; > + int strpc, strpi; > > if (params == NULL) > return EINVAL; > @@ -130,33 +134,34 @@ > > printf("Stripe target init function called!!\n"); > > - printf("Stripe target chunk size %s number of stripes %s\n", argv[1], > argv[0]); > - printf("Stripe target device name %s -- offset %s\n", argv[2], argv[3]); > - printf("Stripe target device name %s -- offset %s\n", argv[4], argv[5]); > + printf("Stripe target chunk size %s number of stripes %s\n", > + argv[1], argv[0]); > > - if (atoi(argv[0]) > MAX_STRIPES) > - return ENOTSUP; > - > - if ((tsc = kmem_alloc(sizeof(dm_target_stripe_config_t), KM_NOSLEEP)) > - == NULL) > + if ((tsc = kmem_alloc(sizeof(*tsc), KM_NOSLEEP)) == NULL) > return ENOMEM; > > - /* Insert dmp to global pdev list */ > - if ((tsc->stripe_devs[0].pdev = dm_pdev_insert(argv[2])) == NULL) > - return ENOENT; > - > - /* Insert dmp to global pdev list */ > - if ((tsc->stripe_devs[1].pdev = dm_pdev_insert(argv[4])) == NULL) > - return ENOENT; > - > - tsc->stripe_devs[0].offset = atoi(argv[3]); > - tsc->stripe_devs[1].offset = atoi(argv[5]); > + /* Initialize linked list for striping devices */ > + TAILQ_INIT(&tsc->stripe_devs); > > /* Save length of param string */ > tsc->params_len = len; > tsc->stripe_chunksize = atoi(argv[1]); > tsc->stripe_num = (uint8_t) atoi(argv[0]); > > + strpc = DM_STRIPE_DEV_OFFSET + (tsc->stripe_num * 2); > + for (strpi = DM_STRIPE_DEV_OFFSET; strpi < strpc; strpi += 2) { > + printf("Stripe target device name %s -- offset %s\n", > + argv[strpi], argv[strpi+1]); > + > + tlc = kmem_alloc(sizeof(*tlc), KM_NOSLEEP); > + if ((tlc->pdev = dm_pdev_insert(argv[strpi])) == NULL) > + return ENOENT; > + tlc->offset = atoi(argv[strpi+1]); > + > + /* Insert striping device to linked list. */ > + TAILQ_INSERT_TAIL(&tsc->stripe_devs, tlc, entries); > + } > + > *target_config = tsc; > > dmv->dev_type = DM_STRIPE_DEV; > @@ -167,18 +172,28 @@ > char * > dm_target_stripe_status(void *target_config) > { > + dm_target_linear_config_t *tlc; > dm_target_stripe_config_t *tsc; > - char *params; > + char *params, *tmp; > > tsc = target_config; > > if ((params = kmem_alloc(DM_MAX_PARAMS_SIZE, KM_SLEEP)) == NULL) > return NULL; > > - snprintf(params, DM_MAX_PARAMS_SIZE, "%d %" PRIu64 " %s %" PRIu64 " %s > %" PRIu64, > - tsc->stripe_num, tsc->stripe_chunksize, > - tsc->stripe_devs[0].pdev->name, tsc->stripe_devs[0].offset, > - tsc->stripe_devs[1].pdev->name, tsc->stripe_devs[1].offset); > + if ((tmp = kmem_alloc(DM_MAX_PARAMS_SIZE, KM_SLEEP)) == NULL) > + return NULL; > + > + snprintf(params, DM_MAX_PARAMS_SIZE, "%d %" PRIu64, > + tsc->stripe_num, tsc->stripe_chunksize); > + > + TAILQ_FOREACH(tlc, &tsc->stripe_devs, entries) { > + snprintf(tmp, DM_MAX_PARAMS_SIZE, " %s %" PRIu64, > + tlc->pdev->name, tlc->offset); > + strcat(params, tmp); > + } > + > + kmem_free(tmp, DM_MAX_PARAMS_SIZE); > > return params; > } > @@ -186,12 +201,13 @@ > int > dm_target_stripe_strategy(dm_table_entry_t * table_en, struct buf * bp) > { > + dm_target_linear_config_t *tlc; > dm_target_stripe_config_t *tsc; > struct buf *nestbuf; > uint64_t blkno, blkoff; > uint64_t stripe, stripe_blknr; > uint32_t stripe_off, stripe_rest, num_blks, issue_blks; > - int stripe_devnr; > + int i, stripe_devnr; > > tsc = table_en->target_config; > if (tsc == NULL) > @@ -224,9 +240,17 @@ > > nestiobuf_setup(bp, nestbuf, blkoff, issue_blks * DEV_BSIZE); > nestbuf->b_blkno = stripe_blknr * tsc->stripe_chunksize + > stripe_off; > - nestbuf->b_blkno += tsc->stripe_devs[stripe_devnr].offset; > > - VOP_STRATEGY(tsc->stripe_devs[stripe_devnr].pdev->pdev_vnode, > nestbuf); > + tlc = TAILQ_FIRST(&tsc->stripe_devs); > + for (i = 0; i < stripe_devnr && tlc == NULL; i++) > + tlc = TAILQ_NEXT(tlc, entries); > + > + /* by this point we should have an tlc */ > + KASSERT(tlc == NULL); > + > + nestbuf->b_blkno += tlc->offset; > + > + VOP_STRATEGY(tlc->pdev->pdev_vnode, nestbuf); > > blkno += issue_blks; > blkoff += issue_blks * DEV_BSIZE; > @@ -242,16 +266,17 @@ > int > dm_target_stripe_sync(dm_table_entry_t * table_en) > { > - int cmd, err, i; > + int cmd, err; > dm_target_stripe_config_t *tsc; > + dm_target_linear_config_t *tlc; > > tsc = table_en->target_config; > > err = 0; > cmd = 1; > > - for (i = 0; i < tsc->stripe_num; i++) { > - if ((err = VOP_IOCTL(tsc->stripe_devs[i].pdev->pdev_vnode, > DIOCCACHESYNC, > + TAILQ_FOREACH(tlc, &tsc->stripe_devs, entries) { > + if ((err = VOP_IOCTL(tlc->pdev->pdev_vnode, DIOCCACHESYNC, > &cmd, FREAD|FWRITE, kauth_cred_get())) != 0) > return err; > } > @@ -264,19 +289,23 @@ > dm_target_stripe_destroy(dm_table_entry_t * table_en) > { > dm_target_stripe_config_t *tsc; > + dm_target_linear_config_t *tlc; > > tsc = table_en->target_config; > > if (tsc == NULL) > return 0; > > - dm_pdev_decr(tsc->stripe_devs[0].pdev); > - dm_pdev_decr(tsc->stripe_devs[1].pdev); > + while ((tlc = TAILQ_FIRST(&tsc->stripe_devs)) != NULL) { > + TAILQ_REMOVE(&tsc->stripe_devs, tlc, entries); > + dm_pdev_decr(tlc->pdev); > + kmem_free(tlc, sizeof(*tlc)); > + } > > /* Unbusy target so we can unload it */ > dm_target_unbusy(table_en->target); > > - kmem_free(tsc, sizeof(dm_target_stripe_config_t)); > + kmem_free(tsc, sizeof(*tsc)); > > table_en->target_config = NULL; > > @@ -287,6 +316,7 @@ > dm_target_stripe_deps(dm_table_entry_t * table_en, prop_array_t prop_array) > { > dm_target_stripe_config_t *tsc; > + dm_target_linear_config_t *tlc; > struct vattr va; > > int error; > @@ -296,15 +326,12 @@ > > tsc = table_en->target_config; > > - if ((error = VOP_GETATTR(tsc->stripe_devs[0].pdev->pdev_vnode, &va, > curlwp->l_cred)) != 0) > - return error; > - > - prop_array_add_uint64(prop_array, (uint64_t) va.va_rdev); > + TAILQ_FOREACH(tlc, &tsc->stripe_devs, entries) { > + if ((error = VOP_GETATTR(tlc->pdev->pdev_vnode, &va, > curlwp->l_cred)) != 0) > + return error; > > - if ((error = VOP_GETATTR(tsc->stripe_devs[1].pdev->pdev_vnode, &va, > curlwp->l_cred)) != 0) > - return error; > - > - prop_array_add_uint64(prop_array, (uint64_t) va.va_rdev); > + prop_array_add_uint64(prop_array, (uint64_t) va.va_rdev); > + } > > return 0; > } > > Added files: > > Index: src/sys/dev/dm/doc/locking.txt > diff -u /dev/null src/sys/dev/dm/doc/locking.txt:1.1 > --- /dev/null Sat Oct 23 21:18:55 2010 > +++ src/sys/dev/dm/doc/locking.txt Sat Oct 23 21:18:55 2010 > @@ -0,0 +1,263 @@ > + > + Device-mapper Locking architecture > + > +Overview > + > +There are 2 users in device-mapper driver > + a) Users who uses disk drives > + b) Users who uses ioctl management interface > + > +Management is done by dm_dev_*_ioctl and dm_table_*_ioctl routines. There > are > +two major structures used in these routines/device-mapper. > + > +Table entry: > + > +typedef struct dm_table_entry { > + struct dm_dev *dm_dev; /* backlink */ > + uint64_t start; > + uint64_t length; > + > + struct dm_target *target; /* Link to table target. */ > + void *target_config; /* Target specific data. */ > + SLIST_ENTRY(dm_table_entry) next; > +} dm_table_entry_t; > + > +This structure stores every target part of dm device. Every device can have > +more than one target mapping entries stored in a list. This structure > describe > +mapping between logical/physical blocks in dm device. > + > +start length target block device offset > +0 102400 linear /dev/wd1a 384 > +102400 204800 linear /dev/wd2a 384 > +204800 409600 linear /dev/wd3a 384 > + > +Every device has at least two tables ACTIVE and INACTIVE. Only ACTIVE table > is > +used during IO. Every IO operation on dm device have to walk through > dm_table_entries list. > + > +Device entry: > + > +typedef struct dm_dev { > + char name[DM_NAME_LEN]; > + char uuid[DM_UUID_LEN]; > + > + int minor; > + uint32_t flags; /* store communication protocol flags */ > + > + kmutex_t dev_mtx; /* mutex for generall device lock */ > + kcondvar_t dev_cv; /* cv for ioctl synchronisation */ > + > + uint32_t event_nr; > + uint32_t ref_cnt; > + > + uint32_t dev_type; > + > + dm_table_head_t table_head; > + > + struct dm_dev_head upcalls; > + > + struct disklabel *dk_label; /* Disklabel for this table. */ > + > + TAILQ_ENTRY(dm_dev) next_upcall; /* LIST of mirrored, snapshoted > devices. */ > + > + TAILQ_ENTRY(dm_dev) next_devlist; /* Major device list. */ > +} dm_dev_t; > + > +Every device created in dm device-mapper is represented with this structure. > +All devices are stored in a list. Every ioctl routine have to work with this > +structure. > + > + Locking in dm driver > + > +Locking must be done in two ways. Synchronisation between ioctl routines and > +between IO operations and ioctl. Table entries are read during IO and during > some ioctl routines. There are only few routines which manipulates table > lists. > + > +Read access to table list: > + > +dmsize > +dmstrategy > +dm_dev_status_ioctl > +dm_table_info_ioctl > +dm_table_deps_ioctl > +dm_disk_ioctl -> DIOCCACHESYNC ioctl > + > +Write access to table list: > +dm_dev_remove_ioctl -> remove device from list, this routine have to > > + remove all tables. > +dm_dev_resume_ioctl -> Switch tables on suspended device, switch > INACTIVE > + and ACTIVE tables. > +dm_table_clear_ioctl -> Remove INACTIVE table from table list. > + > + > +Synchronisation between readers and writers in table list > + > +I moved everything needed for table synchronisation to struct dm_table_head. > + > +typedef struct dm_table_head { > + /* Current active table is selected with this. */ > + int cur_active_table; > + struct dm_table tables[2]; > + > + kmutex_t table_mtx; > + kcondvar_t table_cv; /*IO waiting cv */ > + > + uint32_t io_cnt; > +} dm_table_head_t; > + > +dm_table_head_t is used as entry for every dm_table synchronisation routine. > + > +Because every table user have to get list to table list head I have > implemented > +these routines to manage access to table lists. > + > +/* > > + * Destroy all table data. This function can run when there are no > > + * readers on table lists. > > + */ > +int dm_table_destroy(dm_table_head_t *, uint8_t); > + > +/* > > + * Return length of active table in device. > > + */ > +uint64_t dm_table_size(dm_table_head_t *); > + > +/* > > + * Return current active table to caller, increment io_cnt reference > counter. > + */ > +struct dm_table * dm_table_get_entry(dm_table_head_t *, uint8_t); > + > +/* > > + * Return > 0 if table is at least one table entry (returns number of > entries) > + * and return 0 if there is not. Target count returned from this function > > + * doesn't need to be true when userspace user receive it (after return > > + * there can be dm_dev_resume_ioctl), therfore this isonly informative. > > + */ > +int dm_table_get_target_count(dm_table_head_t *, uint8_t); > + > +/* > > + * Decrement io reference counter and wake up all callers, with table_head > cv. > + */ > +void dm_table_release(dm_table_head_t *, uint8_t s); > + > +/* > > + * Switch table from inactive to active mode. Have to wait until io_cnt is > 0. > + */ > +void dm_table_switch_tables(dm_table_head_t *); > + > +/* > > + * Initialize table_head structures, I'm trying to keep this structure as > > + * opaque as possible. > > + */ > +void dm_table_head_init(dm_table_head_t *); > + > +/* > > + * Destroy all variables in table_head > > + */ > +void dm_table_head_destroy(dm_table_head_t *); > + > +Internal table synchronisation protocol > + > +Readers: > +dm_table_size > +dm_table_get_target_count > +dm_table_get_target_count > + > +Readers with hold reference counter: > +dm_table_get_entry > +dm_table_release > + > +Writer: > +dm_table_destroy > +dm_table_switch_tables > + > +For managing synchronisation to table lists I use these routines. Every > reader > +uses dm_table_busy routine to hold reference counter during work and > dm_table_unbusy for reference counter release. Every writer have to wait > while > +is reference counter 0 and only then it can work with device. It will sleep > on > +head->table_cv while there are other readers. dm_table_get_entry is specific > in that it will return table with hold reference counter. After > dm_table_get_entry > +every caller must call dm_table_release when it doesn't want to work with > it. > + > +/* > > + * Function to increment table user reference counter. Return id > > + * of table_id table. > > + * DM_TABLE_ACTIVE will return active table id. > > + * DM_TABLE_INACTIVE will return inactive table id. > > + */ > +static int > +dm_table_busy(dm_table_head_t *head, uint8_t table_id) > +{ > + uint8_t id; > + > + id = 0; > + > + mutex_enter(&head->table_mtx); > + > + if (table_id == DM_TABLE_ACTIVE) > + id = head->cur_active_table; > + else > + id = 1 - head->cur_active_table; > + > + head->io_cnt++; > + > + mutex_exit(&head->table_mtx); > + return id; > +} > + > +/* > > + * Function release table lock and eventually wakeup all waiters. > > + */ > +static void > +dm_table_unbusy(dm_table_head_t *head) > +{ > + KASSERT(head->io_cnt != 0); > + > + mutex_enter(&head->table_mtx); > + > + if (--head->io_cnt == 0) > + cv_broadcast(&head->table_cv); > + > + mutex_exit(&head->table_mtx); > +} > + > +Device-mapper betwwen ioctl device synchronisation > + > + > +Every ioctl user have to find dm_device with name, uuid, minor number. > +For this dm_dev_lookup is used. This routine returns device with hold > reference > +counter. > + > +void > +dm_dev_busy(dm_dev_t *dmv) > +{ > + mutex_enter(&dmv->dev_mtx); > + dmv->ref_cnt++; > + mutex_exit(&dmv->dev_mtx); > +} > + > +void > +dm_dev_unbusy(dm_dev_t *dmv) > +{ > + KASSERT(dmv->ref_cnt != 0); > + > + mutex_enter(&dmv->dev_mtx); > + if (--dmv->ref_cnt == 0) > + cv_broadcast(&dmv->dev_cv); > + mutex_exit(&dmv->dev_mtx); > +} > + > +Before returning from ioctl routine must release reference counter with > +dm_dev_unbusy. > + > +dm_dev_remove_ioctl routine have to remove dm_dev from global device list, > +and wait until all ioctl users from dm_dev are gone. > + > + > + > + > + > + > + > + > + > + > + > + > + > + > Regards Adam.