Hello,
      I would like to understand blktap's asynchronous I/O read and write
operations on VHD, and in particular the detailed flow of block-vhd.c: how
reads and writes are performed on a VHD, and whether the code could be
improved for better read/write performance. I would also appreciate a
description of the dynamic disk structure.
      I want to modify the fast-clone code to implement copy-on-read;
currently, data is copied into the VHD only after a write. Could you
explain, step by step, which functions in block-vhd.c are called so that
the fast copy is done?

I have attached the code of block-vhd.c.


Thanks,
Akash Talole
/* 
 * Copyright (C) Citrix Systems Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2.1 only
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

/*
 * block-vhd.c: asynchronous vhd implementation.
 *
 * A note on write transactions:
 * Writes that require updating the BAT or bitmaps cannot be signaled
 * as complete until all updates have reached disk.  Transactions are
 * used to ensure proper ordering in these cases.  The two types of
 * transactions are as follows:
 *   - Bitmap updates only: data writes that require updates to the same
 *     bitmap are grouped in a transaction.  Only after all data writes
 *     in a transaction complete does the bitmap write commence.  Only
 *     after the bitmap write finishes are the data writes signalled as
 *     complete.
 *   - BAT and bitmap updates: data writes are grouped in transactions
 *     as above, but a special extra write is included in the transaction,
 *     which zeros out the newly allocated bitmap on disk.  When the data
 *     writes and the zero-bitmap write complete, the BAT and bitmap writes
 *     are started in parallel.  The transaction is completed only after both
 *     the BAT and bitmap writes successfully return.
 */
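
/*
 * Illustrative call flow (an editor's sketch derived from the code below,
 * not part of the original driver): a write into an unallocated block,
 * without preallocation, proceeds roughly as:
 *
 *   vhd_queue_write()
 *     -> schedule_data_write(UPDATE_BAT | UPDATE_BITMAP)
 *          -> update_bat() -> reserve_new_block()
 *                          -> schedule_zero_bm_write()    zero the new bitmap
 *   ... aio completions arrive via vhd_complete() ...
 *   finish_zero_bm_write()  -> schedule_bat_write()       persist BAT entry
 *   finish_data_write()     -> finish_data_transaction()
 *                           -> schedule_bitmap_write()    write shadow bitmap
 *   finish_bitmap_write()   -> finish_bitmap_transaction()
 *                           -> signal_completion()        requests returned
 */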

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <uuid/uuid.h> /* For whatever reason, Linux packages this in */
                       /* e2fsprogs-devel.                            */
#include <string.h>    /* for memset.                                 */
#include <libaio.h>
#include <sys/mman.h>
#include <limits.h>

#include "debug.h"
#include "libvhd.h"
#include "tapdisk.h"
#include "tapdisk-driver.h"
#include "tapdisk-interface.h"
#include "tapdisk-disktype.h"
#include "tapdisk-storage.h"

unsigned int SPB;

#define DEBUGGING   2
#define MICROSOFT_COMPAT

#define VHD_BATMAP_MAX_RETRIES 10

#define __TRACE(s)							\
	do {								\
		DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %"	\
		    PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: "	\
		    "%u, BBLK: 0x%04x\n",				\
		    s->vhd.file, s->queued, s->completed, s->returned,	\
		    VHD_REQS_DATA - s->vreq_free_count,			\
		    s->bat.pbw_blk);					\
	} while(0)

#if (DEBUGGING == 1)
  #define DBG(level, _f, _a...)      DPRINTF(_f, ##_a)
  #define ERR(_s, err, _f, _a...)    DPRINTF("ERROR: %d: " _f, err, ##_a)
  #define TRACE(s)                   ((void)0)
#elif (DEBUGGING == 2)
  #define DBG(level, _f, _a...)      tlog_write(level, _f, ##_a)
  #define ERR(_s, _err, _f, _a...)   tlog_drv_error((_s)->driver, _err, _f, ##_a)
  #define TRACE(s)                   __TRACE(s)
#else
  #define DBG(level, _f, _a...)      ((void)0)
  #define ERR(_s, err, _f, _a...)    ((void)0)
  #define TRACE(s)                   ((void)0)
#endif

/******VHD DEFINES******/
#define VHD_CACHE_SIZE               32

#define VHD_REQS_DATA                TAPDISK_DATA_REQUESTS
#define VHD_REQS_META                (VHD_CACHE_SIZE + 2)
#define VHD_REQS_TOTAL               (VHD_REQS_DATA + VHD_REQS_META)

#define VHD_OP_BAT_WRITE             0
#define VHD_OP_DATA_READ             1
#define VHD_OP_DATA_WRITE            2
#define VHD_OP_BITMAP_READ           3
#define VHD_OP_BITMAP_WRITE          4
#define VHD_OP_ZERO_BM_WRITE         5
#define VHD_OP_REDUNDANT_BM_WRITE    6

#define VHD_BM_BAT_LOCKED            0
#define VHD_BM_BAT_CLEAR             1
#define VHD_BM_BIT_CLEAR             2
#define VHD_BM_BIT_SET               3
#define VHD_BM_NOT_CACHED            4
#define VHD_BM_READ_PENDING          5

#define VHD_FLAG_OPEN_RDONLY         1
#define VHD_FLAG_OPEN_NO_CACHE       2
#define VHD_FLAG_OPEN_QUIET          4
#define VHD_FLAG_OPEN_STRICT         8
#define VHD_FLAG_OPEN_QUERY          16
#define VHD_FLAG_OPEN_PREALLOCATE    32
#define VHD_FLAG_OPEN_NO_O_DIRECT    64
#define VHD_FLAG_OPEN_LOCAL_CACHE    128

#define VHD_FLAG_BAT_LOCKED          1
#define VHD_FLAG_BAT_WRITE_STARTED   2

#define VHD_FLAG_BM_UPDATE_BAT       1
#define VHD_FLAG_BM_WRITE_PENDING    2
#define VHD_FLAG_BM_READ_PENDING     4
#define VHD_FLAG_BM_LOCKED           8

#define VHD_FLAG_REQ_UPDATE_BAT      1
#define VHD_FLAG_REQ_UPDATE_BITMAP   2
#define VHD_FLAG_REQ_QUEUED          4
#define VHD_FLAG_REQ_FINISHED        8

#define VHD_FLAG_TX_LIVE             1
#define VHD_FLAG_TX_UPDATE_BAT       2

typedef uint8_t vhd_flag_t;

struct vhd_state;
struct vhd_request;

struct vhd_req_list {
	struct vhd_request       *head;
	struct vhd_request       *tail;
};

struct vhd_transaction {
	int                       error;
	int                       closed;
	int                       started;
	int                       finished;
	vhd_flag_t                status;
	struct vhd_req_list       requests;
};

struct vhd_request {
	int                       error;
	uint8_t                   op;
	vhd_flag_t                flags;
	td_request_t              treq;
	struct tiocb              tiocb;
	struct vhd_state         *state;
	struct vhd_request       *next;
	struct vhd_transaction   *tx;
};

struct vhd_bat_state {
	vhd_bat_t                 bat;
	vhd_batmap_t              batmap;
	vhd_flag_t                status;
	uint32_t                  pbw_blk;     /* blk num of pending write */
	uint64_t                  pbw_offset;  /* file offset of same */
	struct vhd_request        req;         /* for writing bat table */
	struct vhd_request        zero_req;    /* for initializing bitmaps */
	char                     *bat_buf;
};

struct vhd_bitmap {
	uint32_t                  blk;
	uint64_t                  seqno;       /* lru sequence number */
	vhd_flag_t                status;

	char                     *map;         /* map should only be modified
					        * in finish_bitmap_write */
	char                     *shadow;      /* in-memory bitmap changes are 
					        * made to shadow and copied to
					        * map only after having been
					        * flushed to disk */
	struct vhd_transaction    tx;          /* transaction data structure
						* encapsulating data, bitmap, 
						* and bat writes */
	struct vhd_req_list       queue;       /* data writes waiting for next
						* transaction */
	struct vhd_req_list       waiting;     /* pending requests that cannot
					        * be serviced until this bitmap
					        * is read from disk */
	struct vhd_request        req;
};
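
/*
 * Editor's note -- the map/shadow pair implements an atomic bitmap update
 * (see finish_bitmap_transaction()); a minimal sketch of the protocol:
 *
 *	vhd_bitmap_set(&s->vhd, bm->shadow, sec);  // stage the change
 *	schedule_bitmap_write(s, blk);             // writes bm->shadow
 *	// on success: memcpy(bm->map, bm->shadow, map_size)
 *	// on error:   memcpy(bm->shadow, bm->map, map_size)   (roll back)
 */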

struct vhd_state {
	vhd_flag_t                flags;

        /* VHD stuff */
	vhd_context_t             vhd;
	uint32_t                  spp;         /* sectors per page */
	uint32_t                  spb;         /* sectors per block */
	uint64_t                  first_db;    /* pointer to datablock 0 */

	/**
	 * Pointer to the next (unallocated) datablock. If greater than UINT_MAX,
	 * there are no more blocks available.
	 */
	uint64_t                  next_db;

	struct vhd_bat_state      bat;

	uint64_t                  bm_lru;      /* lru sequence number */
	uint32_t                  bm_secs;     /* size of bitmap, in sectors */
	struct vhd_bitmap        *bitmap[VHD_CACHE_SIZE];

	int                       bm_free_count;
	struct vhd_bitmap        *bitmap_free[VHD_CACHE_SIZE];
	struct vhd_bitmap         bitmap_list[VHD_CACHE_SIZE];

	int                       vreq_free_count;
	struct vhd_request       *vreq_free[VHD_REQS_DATA];
	struct vhd_request        vreq_list[VHD_REQS_DATA];

	/* for redundant bitmap writes */
	int                       padbm_size;
	char                     *padbm_buf;
	long int                  debug_skipped_redundant_writes;
	long int                  debug_done_redundant_writes;

	td_driver_t              *driver;

	uint64_t                  queued;
	uint64_t                  completed;
	uint64_t                  returned;
	uint64_t                  reads;
	uint64_t                  read_size;
	uint64_t                  writes;
	uint64_t                  write_size;
};

#define test_vhd_flag(word, flag)  ((word) & (flag))
#define set_vhd_flag(word, flag)   ((word) |= (flag))
#define clear_vhd_flag(word, flag) ((word) &= ~(flag))

#define bat_entry(s, blk)          ((s)->bat.bat.bat[(blk)])

static void vhd_complete(void *, struct tiocb *, int);
static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);

static struct vhd_state  *_vhd_master;
static unsigned long      _vhd_zsize;
static char              *_vhd_zeros = NULL;
int                       _dev_zero = -1;

static int
vhd_initialize(struct vhd_state *s)
{
	int err;

	if (_vhd_zeros)
		return 0;

	_vhd_zsize = 2 * getpagesize();
	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
		_vhd_zsize += VHD_BLOCK_SIZE;

	_dev_zero = open("/dev/zero", O_RDONLY);
	if (unlikely(_dev_zero == -1)) {
		err = errno;
		EPRINTF("failed to open /dev/zero: %s\n", strerror(err));
		return -err;
	}

	_vhd_zeros = mmap(NULL, _vhd_zsize, PROT_READ,
			  MAP_SHARED, _dev_zero, 0);
	if (_vhd_zeros == MAP_FAILED) {
		int _err;
		err = errno;
		EPRINTF("vhd_initialize failed: %s\n", strerror(err));
		_vhd_zeros = NULL;
		_vhd_zsize = 0;
		_err = close(_dev_zero);
		if (unlikely(_err == -1))
			EPRINTF("failed to close /dev/zero: %s (error ignored)\n",
					strerror(errno));
		else
			_dev_zero = -1;

		return -err;
	}

	_vhd_master = s;
	return 0;
}

static void
vhd_free(struct vhd_state *s)
{
	if (_vhd_master != s || !_vhd_zeros)
		return;

	free(s->padbm_buf);
	munmap(_vhd_zeros, _vhd_zsize);
	_vhd_zsize  = 0;
	_vhd_zeros  = NULL;
	_vhd_master = NULL;
	if (_dev_zero != -1) {
		int _err = close(_dev_zero);
		if (unlikely(_err == -1))
			EPRINTF("failed to close /dev/zero: %s (error ignored)\n",
					strerror(errno));
		else
			_dev_zero = -1;
	}
}

static char *
_get_vhd_zeros(const char *func, unsigned long size)
{
	if (!_vhd_zeros || _vhd_zsize < size) {
		EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
			func, size, _vhd_zsize, _vhd_zeros);
		ASSERT(0);
	}

	return _vhd_zeros;
}

#define vhd_zeros(size)	_get_vhd_zeros(__func__, size)

static inline void
set_batmap(struct vhd_state *s, uint32_t blk)
{
	if (s->bat.batmap.map) {
		vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
		DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
	}
}

static inline int
test_batmap(struct vhd_state *s, uint32_t blk)
{
	if (!s->bat.batmap.map)
		return 0;
	return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk);
}

static int
vhd_kill_footer(struct vhd_state *s)
{
	int err;
	off64_t end;
	void *zeros;

	if (s->vhd.footer.type == HD_TYPE_FIXED)
		return 0;

	err = posix_memalign(&zeros, 512, 512);
	if (err)
		return -err;

	err = 1;
	memset(zeros, 0xc7, 512); /* memset uses only the low byte of its value */

	if ((end = lseek64(s->vhd.fd, 0, SEEK_END)) == -1)
		goto fail;

	if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == -1)
		goto fail;

	if (write(s->vhd.fd, zeros, 512) != 512)
		goto fail;

	err = 0;

 fail:
	free(zeros);
	if (err)
		return (errno ? -errno : -EIO);
	return 0;
}

static inline int
find_next_free_block(struct vhd_state *s)
{
	int err;
	off64_t eom;
	uint32_t i, entry;

	err = vhd_end_of_headers(&s->vhd, &eom);
	if (err)
		return err;

	s->next_db = secs_round_up(eom);
	s->first_db = s->next_db;
	if ((s->first_db + s->bm_secs) % s->spp)
		s->first_db += (s->spp - ((s->first_db + s->bm_secs) % s->spp));

	for (i = 0; i < s->bat.bat.entries; i++) {
		entry = bat_entry(s, i);
		if (entry != DD_BLK_UNUSED && entry >= s->next_db) {
			s->next_db = (uint64_t)entry + (uint64_t)s->spb
				+ (uint64_t)s->bm_secs;
			if (s->next_db > UINT_MAX)
				break;
		}
	}

	return 0;
}

static void
vhd_free_bat(struct vhd_state *s)
{
	free(s->bat.bat.bat);
	free(s->bat.batmap.map);
	free(s->bat.bat_buf);
	memset(&s->bat, 0, sizeof(s->bat)); /* clear the whole vhd_bat_state */
}

static int
vhd_initialize_bat(struct vhd_state *s)
{
	int err, batmap_required, i;
	void *buf;

	memset(&s->bat, 0, sizeof(s->bat));

	err = vhd_read_bat(&s->vhd, &s->bat.bat);
	if (err) {
		EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
		return err;
	}

	batmap_required = 1;
	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
		batmap_required = 0;
	} else {
		err = find_next_free_block(s);
		if (err)
			goto fail;
	}

	if (vhd_has_batmap(&s->vhd)) {
		for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
			err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
			if (err) {
				EPRINTF("%s: reading batmap: %d\n",
						s->vhd.file, err);
				if (batmap_required)
					goto fail;
			} else {
				break;
			}
		}
		if (err)
			EPRINTF("%s: ignoring non-critical batmap error\n",
					s->vhd.file);
	}

	err = posix_memalign(&buf, VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
	if (err)
		goto fail;

	s->bat.bat_buf = buf;

	return 0;

fail:
	vhd_free_bat(s);
	return err;
}

static void
vhd_free_bitmap_cache(struct vhd_state *s)
{
	int i;
	struct vhd_bitmap *bm;

	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		bm = s->bitmap_list + i;
		free(bm->map);
		free(bm->shadow);
		s->bitmap_free[i] = NULL;
	}

	memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
}

static int
vhd_initialize_bitmap_cache(struct vhd_state *s)
{
	int i, err, map_size;
	struct vhd_bitmap *bm;
	void *map, *shadow;

	memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);

	s->bm_lru        = 0;
	map_size         = vhd_sectors_to_bytes(s->bm_secs);
	s->bm_free_count = VHD_CACHE_SIZE;

	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		bm = s->bitmap_list + i;

		err = posix_memalign(&map, 512, map_size);
		if (err)
			goto fail;

		bm->map = map;

		err = posix_memalign(&shadow, 512, map_size);
		if (err)
			goto fail;

		bm->shadow = shadow;

		memset(bm->map, 0, map_size);
		memset(bm->shadow, 0, map_size);
		s->bitmap_free[i] = bm;
	}

	return 0;

fail:
	vhd_free_bitmap_cache(s);
	return err;
}

static int
vhd_initialize_dynamic_disk(struct vhd_state *s)
{
	uint32_t bm_size;
	void *buf;
	int err;

	err = vhd_get_header(&s->vhd);
	if (err) {
		if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
			EPRINTF("Error reading VHD DD header.\n");
		return err;
	}

	if (s->vhd.header.hdr_ver != 0x00010000) {
		EPRINTF("unsupported header version! (0x%x)\n",
			s->vhd.header.hdr_ver);
		return -EINVAL;
	}

	s->spp     = getpagesize() >> VHD_SECTOR_SHIFT;
	s->spb     = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
	s->bm_secs = secs_round_up_no_zero(s->spb >> 3);

	s->padbm_size = (s->bm_secs / getpagesize()) * getpagesize();
	if (s->bm_secs % getpagesize())
		s->padbm_size += getpagesize();

	err = posix_memalign(&buf, 512, s->padbm_size);
	if (err)
		return -err;

	s->padbm_buf = buf;
	bm_size = s->bm_secs << VHD_SECTOR_SHIFT;
	memset(s->padbm_buf, 0, s->padbm_size - bm_size);
	memset(s->padbm_buf + (s->padbm_size - bm_size), ~0, bm_size);
	s->debug_skipped_redundant_writes = 0;
	s->debug_done_redundant_writes = 0;

	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
		return 0;

	err = vhd_initialize_bat(s);
	if (err)
		return err;

	err = vhd_initialize_bitmap_cache(s);
	if (err) {
		vhd_free_bat(s);
		return err;
	}

	return 0;
}

static int
vhd_check_version(struct vhd_state *s)
{
	if (strncmp(s->vhd.footer.crtr_app, "tap", 3))
		return 0;

	if (s->vhd.footer.crtr_ver > VHD_CURRENT_VERSION) {
		if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
			EPRINTF("WARNING: %s vhd creator version 0x%08x, "
				"but only versions up to 0x%08x are "
				"supported for IO\n", s->vhd.file,
				s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);

		return -EINVAL;
	}

	return 0;
}

static void
vhd_log_open(struct vhd_state *s)
{
	char buf[5];
	uint32_t i, allocated, full;

	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
		return;

	snprintf(buf, sizeof(buf), "%s", s->vhd.footer.crtr_app);
	if (!vhd_type_dynamic(&s->vhd)) {
		DPRINTF("%s version: %s 0x%08x\n",
			s->vhd.file, buf, s->vhd.footer.crtr_ver);
		return;
	}

	allocated = 0;
	full      = 0;

	for (i = 0; i < s->bat.bat.entries; i++) {
		if (bat_entry(s, i) != DD_BLK_UNUSED)
			allocated++;
		if (test_batmap(s, i))
			full++;
	}

	DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
		s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
		allocated, full, s->next_db);
}

static int
__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
{
        int i, o_flags, err;
	struct vhd_state *s;

        DBG(TLOG_INFO, "vhd_open: %s\n", name);
	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
		libvhd_set_log_level(1);

	s = (struct vhd_state *)driver->data;
	memset(s, 0, sizeof(struct vhd_state));

	s->flags  = flags;
	s->driver = driver;

	err = vhd_initialize(s);
	if (err)
		return err;

	o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ? 
		   VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
	if ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY) ||
                test_vhd_flag(flags, VHD_FLAG_OPEN_LOCAL_CACHE)) &&
	    test_vhd_flag(flags, VHD_FLAG_OPEN_NO_O_DIRECT))
		set_vhd_flag(o_flags, VHD_OPEN_CACHED);

	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
		set_vhd_flag(o_flags, VHD_OPEN_STRICT);

	err = vhd_open(&s->vhd, name, o_flags);
	if (err) {
		libvhd_set_log_level(1);
		err = vhd_open(&s->vhd, name, o_flags);
		if (err) {
			EPRINTF("Unable to open [%s] (%d)!\n", name, err);
			return err;
		}
	}

	err = vhd_check_version(s);
	if (err)
		goto fail;

	s->spb = s->spp = 1;

	if (vhd_type_dynamic(&s->vhd)) {
		err = vhd_initialize_dynamic_disk(s);
		if (err)
			goto fail;
	}

	vhd_log_open(s);

	SPB = s->spb;

	s->vreq_free_count = VHD_REQS_DATA;
	for (i = 0; i < VHD_REQS_DATA; i++)
		s->vreq_free[i] = s->vreq_list + i;

	driver->info.size        = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
	driver->info.sector_size = VHD_SECTOR_SIZE;
	driver->info.info        = 0;

        DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n",
	    driver->info.size, driver->info.sector_size, driver->info.info);

	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) && 
	    !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
		err = vhd_kill_footer(s);
		if (err) {
			DPRINTF("ERROR killing footer: %d\n", err);
			goto fail;
		}
		s->writes++;
	}

        return 0;

 fail:
	vhd_free_bat(s);
	vhd_free_bitmap_cache(s);
	vhd_close(&s->vhd);
	vhd_free(s);
	return err;
}

static int
_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
{
	vhd_flag_t vhd_flags = 0;

	if (flags & TD_OPEN_RDONLY)
		vhd_flags |= VHD_FLAG_OPEN_RDONLY;
	if (flags & TD_OPEN_NO_O_DIRECT)
		vhd_flags |= VHD_FLAG_OPEN_NO_O_DIRECT;
	if (flags & TD_OPEN_QUIET)
		vhd_flags |= VHD_FLAG_OPEN_QUIET;
	if (flags & TD_OPEN_STRICT)
		vhd_flags |= VHD_FLAG_OPEN_STRICT;
	if (flags & TD_OPEN_QUERY)
		vhd_flags |= (VHD_FLAG_OPEN_QUERY  |
			      VHD_FLAG_OPEN_QUIET  |
			      VHD_FLAG_OPEN_RDONLY |
			      VHD_FLAG_OPEN_NO_CACHE);
	if (flags & TD_OPEN_LOCAL_CACHE)
		vhd_flags |= VHD_FLAG_OPEN_LOCAL_CACHE;

	/* pre-allocate for all but NFS and LVM storage */
	driver->storage = tapdisk_storage_type(name);

	if (driver->storage != TAPDISK_STORAGE_TYPE_NFS &&
	    driver->storage != TAPDISK_STORAGE_TYPE_LVM)
		vhd_flags |= VHD_FLAG_OPEN_PREALLOCATE;

	return __vhd_open(driver, name, vhd_flags);
}

static void
vhd_log_close(struct vhd_state *s)
{
	uint32_t i, allocated, full;

	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
		return;

	allocated = 0;
	full      = 0;

	for (i = 0; i < s->bat.bat.entries; i++) {
		if (bat_entry(s, i) != DD_BLK_UNUSED)
			allocated++;
		if (test_batmap(s, i))
			full++;
	}

	DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
		s->vhd.file, s->bat.bat.entries, allocated, full, s->next_db);
}

static int
_vhd_close(td_driver_t *driver)
{
	int err;
	struct vhd_state *s;
	
	DBG(TLOG_WARN, "vhd_close\n");
	s = (struct vhd_state *)driver->data;

	DPRINTF("gaps written/skipped: %ld/%ld\n", 
			s->debug_done_redundant_writes,
			s->debug_skipped_redundant_writes);

	/* don't write footer if tapdisk is read-only */
	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY))
		goto free;
	
	/* 
	 * write footer if:
	 *   - we killed it on open (opened with strict) 
	 *   - we've written data since opening
	 */
	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) {
		memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
		err = vhd_write_footer(&s->vhd, &s->vhd.footer);
		memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));

		if (err)
			EPRINTF("writing %s footer: %d\n", s->vhd.file, err);

		if (!vhd_has_batmap(&s->vhd))
			goto free;

		err = vhd_write_batmap(&s->vhd, &s->bat.batmap);
		if (err)
			EPRINTF("writing %s batmap: %d\n", s->vhd.file, err);
	}

 free:
	vhd_log_close(s);
	vhd_free_bat(s);
	vhd_free_bitmap_cache(s);
	vhd_close(&s->vhd);
	vhd_free(s);

	memset(s, 0, sizeof(struct vhd_state));

	return 0;
}

int
vhd_validate_parent(td_driver_t *child_driver,
		    td_driver_t *parent_driver, td_flag_t flags)
{
	struct vhd_state *child  = (struct vhd_state *)child_driver->data;
	struct vhd_state *parent;

	if (parent_driver->type != DISK_TYPE_VHD) {
		if (child_driver->type != DISK_TYPE_VHD)
			return -EINVAL;
		if (child->vhd.footer.type != HD_TYPE_DIFF)
			return -EINVAL;
		if (!vhd_parent_raw(&child->vhd))
			return -EINVAL;
		return 0;
	}

	parent = (struct vhd_state *)parent_driver->data;

	/* 
	 * This check removed because of cases like:
	 *   - parent VHD marked as 'hidden'
	 *   - parent VHD modified during coalesce
	 */
	/*
	if (stat(parent->vhd.file, &stats)) {
		DPRINTF("ERROR stating parent file %s\n", parent->vhd.file);
		return -errno;
	}

	if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) {
		DPRINTF("ERROR: parent file has been modified since "
			"snapshot.  Child image no longer valid.\n");
		return -EINVAL;
	}
	*/

	if (uuid_compare(child->vhd.header.prt_uuid, parent->vhd.footer.uuid)) {
		DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
			"snapshot.  Child image no longer valid.\n",
			__func__, child->vhd.file, parent->vhd.file);
		return -EINVAL;
	}

	/* TODO: compare sizes */
	
	return 0;
}

int
vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
{
	int err;
	char *parent;
	struct vhd_state *s;
	int flags;

	DBG(TLOG_DBG, "\n");
	flags = id->flags;
	memset(id, 0, sizeof(td_disk_id_t));

	s = (struct vhd_state *)driver->data;

	if (s->vhd.footer.type != HD_TYPE_DIFF)
		return TD_NO_PARENT;

	err = vhd_parent_locator_get(&s->vhd, &parent);
	if (err)
		return err;

	id->name   = parent;
	id->type   = vhd_parent_raw(&s->vhd) ? DISK_TYPE_AIO : DISK_TYPE_VHD;
	id->flags  = flags|TD_OPEN_SHAREABLE|TD_OPEN_RDONLY;

	return 0;
}

static inline void
clear_req_list(struct vhd_req_list *list)
{
	list->head = list->tail = NULL;
}

static inline void
add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
{
	if (!list->head) 
		list->head = list->tail = e;
	else 
		list->tail = list->tail->next = e;
}

static inline int
remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
{
	struct vhd_request *i = list->head;

	if (list->head == e) {
		if (list->tail == e)
			clear_req_list(list);
		else
			list->head = list->head->next;
		return 0;
	}

	while (i->next) {
		if (i->next == e) {
			if (list->tail == e) {
				i->next = NULL;
				list->tail = i;
			} else
				i->next = i->next->next;
			return 0;
		}
		i = i->next;
	}

	return -EINVAL;
}

static inline void
init_vhd_request(struct vhd_state *s, struct vhd_request *req)
{
	memset(req, 0, sizeof(struct vhd_request));
	req->state = s;
}

static inline void
init_tx(struct vhd_transaction *tx)
{
	memset(tx, 0, sizeof(struct vhd_transaction));
}

static inline void
add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
{
	ASSERT(!tx->closed);

	r->tx = tx;
	tx->started++;
	add_to_tail(&tx->requests, r);
	set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);

	DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
	    "started: %d, finished: %d, status: %u\n",
	    r->treq.sec / SPB, r->treq.sec, tx,
	    tx->started, tx->finished, tx->status);
}

static inline int
transaction_completed(struct vhd_transaction *tx)
{
	return (tx->started == tx->finished);
}

static inline void
init_bat(struct vhd_state *s)
{
	s->bat.req.tx     = NULL;
	s->bat.req.next   = NULL;
	s->bat.req.error  = 0;
	s->bat.pbw_blk    = 0;
	s->bat.pbw_offset = 0;
	s->bat.status     = 0;
}

static inline void
lock_bat(struct vhd_state *s)
{
	set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
}

static inline void
unlock_bat(struct vhd_state *s)
{
	clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
}

static inline int
bat_locked(struct vhd_state *s)
{
	return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
}

static inline void
init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
{
	bm->blk    = 0;
	bm->seqno  = 0;
	bm->status = 0;
	init_tx(&bm->tx);
	clear_req_list(&bm->queue);
	clear_req_list(&bm->waiting);
	memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs));
	memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs));
	init_vhd_request(s, &bm->req);
}

static inline struct vhd_bitmap *
get_bitmap(struct vhd_state *s, uint32_t block)
{
	int i;
	struct vhd_bitmap *bm;

	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		bm = s->bitmap[i];
		if (bm && bm->blk == block)
			return bm;
	}

	return NULL;
}

static inline void
lock_bitmap(struct vhd_bitmap *bm)
{
	set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
}

static inline void
unlock_bitmap(struct vhd_bitmap *bm)
{
	clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
}

static inline int
bitmap_locked(struct vhd_bitmap *bm)
{
	return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
}

static inline int
bitmap_valid(struct vhd_bitmap *bm)
{
	return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
}

static inline int
bitmap_in_use(struct vhd_bitmap *bm)
{
	return (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)  ||
		test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING) ||
		test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT) ||
		bm->waiting.head || bm->tx.requests.head || bm->queue.head);
}

static inline int
bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
{
	int i, n;

	n = s->spb >> 3;
	for (i = 0; i < n; i++)
		if (bm->map[i] != (char)0xFF)
			return 0;

	DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
	return 1;
}

static struct vhd_bitmap *
remove_lru_bitmap(struct vhd_state *s)
{
	int i, idx = 0;
	uint64_t seq = s->bm_lru;
	struct vhd_bitmap *bm, *lru = NULL;

	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		bm = s->bitmap[i];
		if (bm && bm->seqno < seq && !bitmap_locked(bm)) {
			idx = i;
			lru = bm;
			seq = lru->seqno;
		}
	}

	if (lru) {
		s->bitmap[idx] = NULL;
		ASSERT(!bitmap_in_use(lru));
	}

	return  lru;
}

static int
alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
{
	struct vhd_bitmap *bm;
	
	*bitmap = NULL;

	if (s->bm_free_count > 0) {
		bm = s->bitmap_free[--s->bm_free_count];
	} else {
		bm = remove_lru_bitmap(s);
		if (!bm)
			return -EBUSY;
	}

	init_vhd_bitmap(s, bm);
	bm->blk = blk;
	*bitmap = bm;

	return 0;
}

static inline uint64_t
__bitmap_lru_seqno(struct vhd_state *s)
{
	int i;
	struct vhd_bitmap *bm;

	if (s->bm_lru == 0xffffffff) {
		s->bm_lru = 0;
		for (i = 0; i < VHD_CACHE_SIZE; i++) {
			bm = s->bitmap[i];
			if (bm) {
				bm->seqno >>= 1;
				if (bm->seqno > s->bm_lru)
					s->bm_lru = bm->seqno;
			}
		}
	}

	return ++s->bm_lru;
}

static inline void
touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
{
	bm->seqno = __bitmap_lru_seqno(s);
}

static inline void
install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
{
	int i;
	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		if (!s->bitmap[i]) {
			touch_bitmap(s, bm);
			s->bitmap[i] = bm;
			return;
		}
	}

	ASSERT(0);
}

static inline void
free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
{
	int i;

	for (i = 0; i < VHD_CACHE_SIZE; i++)
		if (s->bitmap[i] == bm)
			break;

	ASSERT(!bitmap_locked(bm));
	ASSERT(!bitmap_in_use(bm));
	ASSERT(i < VHD_CACHE_SIZE);

	s->bitmap[i] = NULL;
	s->bitmap_free[s->bm_free_count++] = bm;
}

static int
read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
{
	uint32_t blk, sec;
	struct vhd_bitmap *bm;

	/* in fixed disks, every block is present */
	if (s->vhd.footer.type == HD_TYPE_FIXED) 
		return VHD_BM_BIT_SET;

	/* the extent the logical sector falls in */
	blk = sector / s->spb;

	/* offset within the extent the logical sector is located */
	sec = sector % s->spb;

	if (blk >= s->vhd.header.max_bat_size) {
		DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
			sector, op);
		return -EINVAL;
	}

	if (bat_entry(s, blk) == DD_BLK_UNUSED) {
		if (op == VHD_OP_DATA_WRITE &&
		    s->bat.pbw_blk != blk && bat_locked(s))
			return VHD_BM_BAT_LOCKED;

		return VHD_BM_BAT_CLEAR;
	}

	if (test_batmap(s, blk)) {
		DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
		return VHD_BM_BIT_SET;
	}

	bm = get_bitmap(s, blk);
	if (!bm)
		return VHD_BM_NOT_CACHED;

	/* bump lru count */
	touch_bitmap(s, bm);

	if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
		return VHD_BM_READ_PENDING;

	return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ? 
		VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
}

static int
read_bitmap_cache_span(struct vhd_state *s, 
		       uint64_t sector, int nr_secs, int value)
{
	int ret;
	uint32_t blk, sec;
	struct vhd_bitmap *bm;

	/* in fixed disks, every block is present */
	if (s->vhd.footer.type == HD_TYPE_FIXED) 
		return nr_secs;

	sec = sector % s->spb;
	blk = sector / s->spb;

	if (test_batmap(s, blk))
		return MIN(nr_secs, s->spb - sec);

	bm  = get_bitmap(s, blk);
	
	ASSERT(bm && bitmap_valid(bm));

	for (ret = 0; sec < s->spb && ret < nr_secs; sec++, ret++)
		if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
			break;

	return ret;
}

static inline struct vhd_request *
alloc_vhd_request(struct vhd_state *s)
{
	struct vhd_request *req = NULL;
	
	if (s->vreq_free_count > 0) {
		req = s->vreq_free[--s->vreq_free_count];
		ASSERT(req->treq.secs == 0);
		init_vhd_request(s, req);
		return req;
	}

	return NULL;
}

static inline void
free_vhd_request(struct vhd_state *s, struct vhd_request *req)
{
	memset(req, 0, sizeof(struct vhd_request));
	s->vreq_free[s->vreq_free_count++] = req;
}

static inline void
aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
{
	struct tiocb *tiocb = &req->tiocb;

	td_prep_read(tiocb, s->vhd.fd, req->treq.buf,
		     vhd_sectors_to_bytes(req->treq.secs),
		     offset, vhd_complete, req);
	td_queue_tiocb(s->driver, tiocb);

	s->queued++;
	s->reads++;
	s->read_size += req->treq.secs;
	TRACE(s);
}

static inline void
aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
{
	struct tiocb *tiocb = &req->tiocb;

	td_prep_write(tiocb, s->vhd.fd, req->treq.buf,
		      vhd_sectors_to_bytes(req->treq.secs),
		      offset, vhd_complete, req);
	td_queue_tiocb(s->driver, tiocb);

	s->queued++;
	s->writes++;
	s->write_size += req->treq.secs;
	TRACE(s);
}

/**
 * Reserves a new extent.
 *
 * @returns a 64-bit unsigned integer where the error code is stored in the
 * upper 32 bits and the sector offset at which the zero-bitmap write must
 * begin (the end of the last allocated block) is stored in the lower 32
 * bits.  If an error is returned (the upper 32 bits are not zero), the
 * lower 32 bits are undefined.
 */
static inline uint64_t
reserve_new_block(struct vhd_state *s, uint32_t blk)
{
	int gap = 0;

	ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));

	/* data region of segment should begin on page boundary */
	if ((s->next_db + s->bm_secs) % s->spp)
		gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));

	if (s->next_db + gap > UINT_MAX)
		return (uint64_t)ENOSPC << 32;

	s->bat.pbw_blk    = blk;
	s->bat.pbw_offset = s->next_db + gap;

	return s->next_db;
}
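
/*
 * Editor's note -- a minimal sketch of decoding the packed return value
 * (update_bat() below does exactly this):
 *
 *	uint64_t ret = reserve_new_block(s, blk);
 *	if (ret >> 32)                  // error code in the upper half
 *		return -(ret >> 32);
 *	lb_end = ret;                   // sector where zeroing must begin
 */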

static int
schedule_bat_write(struct vhd_state *s)
{
	int i;
	uint32_t blk;
	char *buf;
	uint64_t offset;
	struct vhd_request *req;

	ASSERT(bat_locked(s));

	req = &s->bat.req;
	buf = s->bat.bat_buf;
	blk = s->bat.pbw_blk;

	init_vhd_request(s, req);
	memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);

	((uint32_t *)buf)[blk % 128] = s->bat.pbw_offset;

	for (i = 0; i < 128; i++)
		BE32_OUT(&((uint32_t *)buf)[i]);

	offset         = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
	req->treq.secs = 1;
	req->treq.buf  = buf;
	req->op        = VHD_OP_BAT_WRITE;
	req->next      = NULL;

	aio_write(s, req, offset);
	set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);

	DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
	    "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);

	return 0;
}

static void
schedule_zero_bm_write(struct vhd_state *s,
		       struct vhd_bitmap *bm, uint64_t lb_end)
{
	uint64_t offset;
	struct vhd_request *req = &s->bat.zero_req;

	init_vhd_request(s, req);

	offset         = vhd_sectors_to_bytes(lb_end);
	req->op        = VHD_OP_ZERO_BM_WRITE;
	req->treq.sec  = s->bat.pbw_blk * s->spb;
	req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
	req->treq.buf  = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
	req->next      = NULL;

	DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
	    s->bat.pbw_blk, offset);

	lock_bitmap(bm);
	add_to_transaction(&bm->tx, req);
	aio_write(s, req, offset);
}

/* This is a performance optimization. When writing sequentially into full 
 * blocks, skipping (up-to-date) bitmaps causes an approx. 25% reduction in 
 * throughput. To prevent skipping, we issue redundant writes into the (padded) 
 * bitmap area just to make all writes sequential. This will help VHDs on raw 
 * block devices, while the FS-based VHDs shouldn't suffer much.
 *
 * Note that it only makes sense to perform this redundant bitmap write if the 
 * block is completely full (i.e. the batmap entry is set). If the block is not 
 * completely full then one of the following two things will be true:
 *  1. we'll either be allocating new sectors in this block and writing its
 *     bitmap transactionally, which will be slow anyways; or
 *  2. the IO will be skipping over the unallocated sectors again, so the
 *     pattern will not be sequential anyways
 * In either case a redundant bitmap write becomes pointless. This fact 
 * simplifies the implementation of redundant writes: since we know the bitmap 
 * cannot be updated by anyone else, we don't have to worry about transactions 
 * or potential write conflicts.
 * */
static void
schedule_redundant_bm_write(struct vhd_state *s, uint32_t blk)
{
	uint64_t offset;
	struct vhd_request *req;

	ASSERT(s->vhd.footer.type != HD_TYPE_FIXED);
	ASSERT(test_batmap(s, blk));

	req = alloc_vhd_request(s);
	if (!req) 
		return;

	req->treq.buf = s->padbm_buf;

	offset = bat_entry(s, blk);
	ASSERT(offset != DD_BLK_UNUSED);
	offset <<= VHD_SECTOR_SHIFT;
	offset -= s->padbm_size - (s->bm_secs << VHD_SECTOR_SHIFT);

	req->op        = VHD_OP_REDUNDANT_BM_WRITE;
	req->treq.sec  = blk * s->spb;
	req->treq.secs = s->padbm_size >> VHD_SECTOR_SHIFT;
	req->next      = NULL;

	DBG(TLOG_DBG, "blk: %u, writing redundant bitmap at %" PRIu64 "\n",
	    blk, offset);

	aio_write(s, req, offset);
}
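
/*
 * Editor's note: this redundant write is issued from schedule_data_write()
 * only when a write lands on the first sector of a completely full block
 * (sec == 0 and the batmap bit set, excluding the first data block),
 * keeping the on-device IO pattern sequential.
 */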

static int
update_bat(struct vhd_state *s, uint32_t blk)
{
	int err;
	uint64_t lb_end;
	struct vhd_bitmap *bm;

	ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
	
	if (bat_locked(s)) {
		ASSERT(s->bat.pbw_blk == blk);
		return 0;
	}

	/* empty bitmap could already be in
	 * cache if earlier bat update failed */
	bm = get_bitmap(s, blk);
	if (!bm) {
		/* install empty bitmap in cache */
		err = alloc_vhd_bitmap(s, &bm, blk);
		if (err) 
			return err;

		install_bitmap(s, bm);
	}

	lock_bat(s);
	lb_end = reserve_new_block(s, blk);
	if (lb_end >> 32) {
		unlock_bat(s);
		return -(lb_end >> 32);
	}
	schedule_zero_bm_write(s, bm, lb_end);
	set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);

	return 0;
}

static int
allocate_block(struct vhd_state *s, uint32_t blk)
{
	int err, gap;
	uint64_t offset, size;
	struct vhd_bitmap *bm;
	ssize_t count;
	uint64_t next_db;

	ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);

	if (bat_locked(s)) {
		ASSERT(s->bat.pbw_blk == blk);
		if (s->bat.req.error)
			return -EBUSY;
		return 0;
	}

	gap     = 0;
	offset  = vhd_sectors_to_bytes(s->next_db);
	next_db = s->next_db;

	/* data region of segment should begin on page boundary */
	if ((next_db + s->bm_secs) % s->spp) {
		gap = (s->spp - ((next_db + s->bm_secs) % s->spp));
		next_db += gap;
	}

	if (next_db > UINT_MAX)
		return -ENOSPC;

	s->next_db = next_db;

	s->bat.pbw_blk = blk;
	s->bat.pbw_offset = s->next_db;

	DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
	    blk, s->bat.pbw_offset);

	if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
		ERR(s, -errno, "lseek failed\n");
		return -errno;
	}

	size  = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
	count = write(s->vhd.fd, vhd_zeros(size), size);
	if (count != size) {
		err = count < 0 ? -errno : -ENOSPC;
		ERR(s, err,
		    "write failed (%zd, offset %"PRIu64")\n", count, offset);
		return err;
	}

	/* empty bitmap could already be in
	 * cache if earlier bat update failed */
	bm = get_bitmap(s, blk);
	if (!bm) {
		/* install empty bitmap in cache */
		err = alloc_vhd_bitmap(s, &bm, blk);
		if (err) 
			return err;

		install_bitmap(s, bm);
	}

	lock_bat(s);
	lock_bitmap(bm);
	schedule_bat_write(s);
	add_to_transaction(&bm->tx, &s->bat.req);

	return 0;
}
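
/*
 * Editor's note: allocate_block() (the VHD_FLAG_OPEN_PREALLOCATE path)
 * synchronously zeroes the whole new block with write(2) before queuing
 * the BAT update, whereas update_bat() only zeroes the bitmap sectors
 * asynchronously and schedules the BAT write later, from
 * finish_zero_bm_write().
 */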

static int 
schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
{
	uint64_t offset;
	uint32_t blk = 0, sec = 0;
	struct vhd_bitmap  *bm;
	struct vhd_request *req;

	if (s->vhd.footer.type == HD_TYPE_FIXED) {
		offset = vhd_sectors_to_bytes(treq.sec);
		goto make_request;
	}

	blk    = treq.sec / s->spb;
	sec    = treq.sec % s->spb;
	bm     = get_bitmap(s, blk);
	offset = bat_entry(s, blk);

	ASSERT(offset != DD_BLK_UNUSED);
	ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));

	offset += s->bm_secs + sec;
	offset  = vhd_sectors_to_bytes(offset);

 make_request:
	req = alloc_vhd_request(s);
	if (!req) 
		return -EBUSY;

	req->treq  = treq;
	req->flags = flags;
	req->op    = VHD_OP_DATA_READ;
	req->next  = NULL;

	aio_read(s, req, offset);

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
	    "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
	    s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
	    treq.buf);

	return 0;
}

static int
schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
{
	int err;
	uint64_t offset;
	uint32_t blk = 0, sec = 0;
	struct vhd_bitmap  *bm = NULL;
	struct vhd_request *req;

	if (s->vhd.footer.type == HD_TYPE_FIXED) {
		offset = vhd_sectors_to_bytes(treq.sec);
		goto make_request;
	}

	blk    = treq.sec / s->spb;
	sec    = treq.sec % s->spb;
	offset = bat_entry(s, blk);

	if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
		if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
			err = allocate_block(s, blk);
		else
			err = update_bat(s, blk);

		if (err)
			return err;

		offset = s->bat.pbw_offset;
	}

	offset += s->bm_secs + sec;
	offset  = vhd_sectors_to_bytes(offset);

 make_request:
	req = alloc_vhd_request(s);
	if (!req)
		return -EBUSY;

	req->treq  = treq;
	req->flags = flags;
	req->op    = VHD_OP_DATA_WRITE;
	req->next  = NULL;

	if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
		bm = get_bitmap(s, blk);
		ASSERT(bm && bitmap_valid(bm));
		lock_bitmap(bm);

		if (bm->tx.closed) {
			add_to_tail(&bm->queue, req);
			set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
		} else
			add_to_transaction(&bm->tx, req);
	} else if (sec == 0 && 	/* first sector inside data block */
		   s->vhd.footer.type != HD_TYPE_FIXED && 
		   bat_entry(s, blk) != s->first_db &&
		   test_batmap(s, blk))
		schedule_redundant_bm_write(s, blk);

	aio_write(s, req, offset);

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
	    "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
	    s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);

	return 0;
}

static int 
schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
{
	int err;
	uint64_t offset;
	struct vhd_bitmap  *bm;
	struct vhd_request *req = NULL;

	ASSERT(vhd_type_dynamic(&s->vhd));

	offset = bat_entry(s, blk);

	ASSERT(offset != DD_BLK_UNUSED);
	ASSERT(!get_bitmap(s, blk));

	offset = vhd_sectors_to_bytes(offset);

	err = alloc_vhd_bitmap(s, &bm, blk);
	if (err)
		return err;

	req = &bm->req;
	init_vhd_request(s, req);

	req->treq.sec  = blk * s->spb;
	req->treq.secs = s->bm_secs;
	req->treq.buf  = bm->map;
	req->treq.cb   = NULL;
	req->op        = VHD_OP_BITMAP_READ;
	req->next      = NULL;

	aio_read(s, req, offset);
	lock_bitmap(bm);
	install_bitmap(s, bm);
	set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
	    "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
	    req->treq.secs, offset);

	return 0;
}

static void
schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
{
	uint64_t offset;
	struct vhd_bitmap  *bm;
	struct vhd_request *req;

	bm     = get_bitmap(s, blk);
	offset = bat_entry(s, blk);

	ASSERT(vhd_type_dynamic(&s->vhd));
	ASSERT(bm && bitmap_valid(bm) &&
	       !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));

	if (offset == DD_BLK_UNUSED) {
		ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
		offset = s->bat.pbw_offset;
	}
	
	offset = vhd_sectors_to_bytes(offset);

	req = &bm->req;
	init_vhd_request(s, req);

	req->treq.sec  = blk * s->spb;
	req->treq.secs = s->bm_secs;
	req->treq.buf  = bm->shadow;
	req->treq.cb   = NULL;
	req->op        = VHD_OP_BITMAP_WRITE;
	req->next      = NULL;

	aio_write(s, req, offset);
	lock_bitmap(bm);
	touch_bitmap(s, bm);     /* bump lru count */
	set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);

	DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
	    "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
	    req->treq.secs, offset);
}

/* 
 * queued requests will be submitted once the bitmap
 * describing them is read and the requests are validated. 
 */
static int
__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
{
	uint32_t blk;
	struct vhd_bitmap  *bm;
	struct vhd_request *req;

	ASSERT(vhd_type_dynamic(&s->vhd));

	blk = treq.sec / s->spb;
	bm  = get_bitmap(s, blk);

	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));

	req = alloc_vhd_request(s);
	if (!req)
		return -EBUSY;

	req->treq = treq;
	req->op   = op;
	req->next = NULL;

	add_to_tail(&bm->waiting, req);
	lock_bitmap(bm);

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
	    "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);

	TRACE(s);
	return 0;
}
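
/*
 * Editor's note: the waiting list filled here is drained in
 * finish_bitmap_read(), which re-submits each queued request through
 * vhd_queue_read()/vhd_queue_write() once the bitmap contents are known.
 */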

static void
vhd_queue_read(td_driver_t *driver, td_request_t treq)
{
	struct vhd_state *s = (struct vhd_state *)driver->data;

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
	    s->vhd.file, treq.sec, treq.secs, treq.sidx);

	while (treq.secs) {
		int err;
		td_request_t clone;

		err   = 0;
		clone = treq;

		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
		case -EINVAL:
			err = -EINVAL;
			goto fail;

		case VHD_BM_BAT_CLEAR:
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			td_forward_request(clone);
			break;

		case VHD_BM_BIT_CLEAR:
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
			td_forward_request(clone);
			break;

		case VHD_BM_BIT_SET:
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
			err = schedule_data_read(s, clone, 0);
			if (err)
				goto fail;
			break;

		case VHD_BM_NOT_CACHED:
			err = schedule_bitmap_read(s, clone.sec / s->spb);
			if (err)
				goto fail;

			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
			if (err)
				goto fail;
			break;

		case VHD_BM_READ_PENDING:
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
			if (err)
				goto fail;
			break;

		case VHD_BM_BAT_LOCKED:
		default:
			ASSERT(0);
			break;
		}

		treq.sec  += clone.secs;
		treq.secs -= clone.secs;
		treq.buf  += vhd_sectors_to_bytes(clone.secs);
		continue;

	fail:
		clone.secs = treq.secs;
		td_complete_request(clone, err);
		break;
	}
}

static void
vhd_queue_write(td_driver_t *driver, td_request_t treq)
{
	struct vhd_state *s = (struct vhd_state *)driver->data;

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
	    s->vhd.file, treq.sec, treq.secs, treq.sidx);

	while (treq.secs) {
		int err;
		uint8_t flags;
		td_request_t clone;

		err   = 0;
		flags = 0;
		clone = treq;

		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
		case -EINVAL:
			err = -EINVAL;
			goto fail;

		case VHD_BM_BAT_LOCKED:
			err = -EBUSY;
			goto fail;

		case VHD_BM_BAT_CLEAR:
			flags      = (VHD_FLAG_REQ_UPDATE_BAT |
				      VHD_FLAG_REQ_UPDATE_BITMAP);
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err        = schedule_data_write(s, clone, flags);
			if (err)
				goto fail;
			break;

		case VHD_BM_BIT_CLEAR:
			flags      = VHD_FLAG_REQ_UPDATE_BITMAP;
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
			err        = schedule_data_write(s, clone, flags);
			if (err)
				goto fail;
			break;

		case VHD_BM_BIT_SET:
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
			err = schedule_data_write(s, clone, 0);
			if (err)
				goto fail;
			break;

		case VHD_BM_NOT_CACHED:
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = schedule_bitmap_read(s, clone.sec / s->spb);
			if (err)
				goto fail;

			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
			if (err)
				goto fail;
			break;

		case VHD_BM_READ_PENDING:
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
			if (err)
				goto fail;
			break;

		default:
			ASSERT(0);
			break;
		}

		treq.sec  += clone.secs;
		treq.secs -= clone.secs;
		treq.buf  += vhd_sectors_to_bytes(clone.secs);
		continue;

	fail:
		clone.secs = treq.secs;
		td_complete_request(clone, err);
		break;
	}
}

static inline void
signal_completion(struct vhd_request *list, int error)
{
	struct vhd_state *s;
	struct vhd_request *r, *next;

	if (!list)
		return;

	r = list;
	s = list->state;

	while (r) {
		int err;

		err  = (error ? error : r->error);
		next = r->next;
		td_complete_request(r->treq, err);
		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
		    "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err);
		free_vhd_request(s, r);
		r    = next;

		s->returned++;
		TRACE(s);
	}
}

static void
start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
{
	struct vhd_transaction *tx;
	struct vhd_request *r, *next;
	int i;

	if (!bm->queue.head)
		return;

	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);

	r  = bm->queue.head;
	tx = &bm->tx;
	clear_req_list(&bm->queue);

	if (r && bat_entry(s, bm->blk) == DD_BLK_UNUSED)
		tx->error = -EIO;

	while (r) {
		next    = r->next;
		r->next = NULL;
		clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);

		add_to_transaction(tx, r);
		if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) {
			tx->finished++;
			if (!r->error) {
				uint32_t sec = r->treq.sec % s->spb;
				for (i = 0; i < r->treq.secs; i++)
					vhd_bitmap_set(&s->vhd,
						       bm->shadow, sec + i);
			}
		}
		r = next;
	}

	/* perhaps all the queued writes already completed? */
	if (tx->started && transaction_completed(tx))
		finish_data_transaction(s, bm);
}

static void
finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
{
	struct vhd_transaction *tx = &bm->tx;

	if (!bat_locked(s))
		return;

	if (s->bat.pbw_blk != bm->blk)
		return;

	if (!s->bat.req.error)
		goto release;

	if (!test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE))
		goto release;

	tx->closed = 1;
	return;

 release:
	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
	unlock_bat(s);
	init_bat(s);
}

static void
finish_bitmap_transaction(struct vhd_state *s,
			  struct vhd_bitmap *bm, int error)
{
	int map_size;
	struct vhd_transaction *tx = &bm->tx;

	DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
	tx->error = (tx->error ? tx->error : error);
	map_size  = vhd_sectors_to_bytes(s->bm_secs);

	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
		if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
			/* still waiting for bat write */
			ASSERT(bm->blk == s->bat.pbw_blk);
			ASSERT(test_vhd_flag(s->bat.status, 
					     VHD_FLAG_BAT_WRITE_STARTED));
			s->bat.req.tx = tx;
			return;
		}
	}

	if (tx->error) {
		/* undo changes to shadow */
		memcpy(bm->shadow, bm->map, map_size);
	} else {
		/* complete atomic write */
		memcpy(bm->map, bm->shadow, map_size);
		if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
			set_batmap(s, bm->blk);
	}

	/* transaction done; signal completions */
	signal_completion(tx->requests.head, tx->error);
	init_tx(tx);
	start_new_bitmap_transaction(s, bm);

	if (!bitmap_in_use(bm))
		unlock_bitmap(bm);

	finish_bat_transaction(s, bm);
}

static void
finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
{
	struct vhd_transaction *tx = &bm->tx;

	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);

	tx->closed = 1;

	if (!tx->error)
		return schedule_bitmap_write(s, bm->blk);

	return finish_bitmap_transaction(s, bm, 0);
}

static void
finish_bat_write(struct vhd_request *req)
{
	struct vhd_bitmap *bm;
	struct vhd_transaction *tx;
	struct vhd_state *s = req->state;

	s->returned++;
	TRACE(s);

	bm = get_bitmap(s, s->bat.pbw_blk);

	DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
	    s->bat.pbw_blk, s->bat.pbw_offset, req->error);
	ASSERT(bm && bitmap_valid(bm));
	ASSERT(bat_locked(s) &&
	       test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));

	tx = &bm->tx;
	ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));

	if (!req->error) {
		bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
		s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
	} else
		tx->error = req->error;

	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
		tx->finished++;
		remove_from_req_list(&tx->requests, req);
		if (transaction_completed(tx))
			finish_data_transaction(s, bm);
	} else {
		clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
		if (s->bat.req.tx)
			finish_bitmap_transaction(s, bm, req->error);
	}

	finish_bat_transaction(s, bm);
}

static void
finish_zero_bm_write(struct vhd_request *req)
{
	uint32_t blk;
	struct vhd_bitmap *bm;
	struct vhd_transaction *tx = req->tx;
	struct vhd_state *s = req->state;

	s->returned++;
	TRACE(s);

	blk = req->treq.sec / s->spb;
	bm  = get_bitmap(s, blk);

	DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
	ASSERT(bat_locked(s));
	ASSERT(s->bat.pbw_blk == blk);
	ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));

	tx->finished++;
	remove_from_req_list(&tx->requests, req);

	if (req->error) {
		unlock_bat(s);
		init_bat(s);
		tx->error = req->error;
		clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
	} else
		schedule_bat_write(s);

	if (transaction_completed(tx))
		finish_data_transaction(s, bm);
}

static int
finish_redundant_bm_write(struct vhd_request *req)
{
	/* uint32_t blk; */
	struct vhd_state *s = (struct vhd_state *) req->state;

	s->returned++;
	TRACE(s);	
	/* blk = req->treq.sec / s->spb;
	   DBG(TLOG_DBG, "blk: %u\n", blk); */

	if (req->error) {
		ERR(s, req->error, "lsec: 0x%08"PRIx64, req->treq.sec);
	}
	free_vhd_request(s, req);
	s->debug_done_redundant_writes++;
	return 0;
}


static void
finish_bitmap_read(struct vhd_request *req)
{
	uint32_t blk;
	struct vhd_bitmap  *bm;
	struct vhd_request *r, *next;
	struct vhd_state   *s = req->state;

	s->returned++;
	TRACE(s);

	blk = req->treq.sec / s->spb;
	bm  = get_bitmap(s, blk);

	DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));

	r = bm->waiting.head;
	clear_req_list(&bm->waiting);
	clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);

	if (!req->error) {
		memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs));

		while (r) {
			struct vhd_request tmp;

			tmp  = *r;
			next =  r->next;
			free_vhd_request(s, r);

			ASSERT(tmp.op == VHD_OP_DATA_READ || 
			       tmp.op == VHD_OP_DATA_WRITE);

			if (tmp.op == VHD_OP_DATA_READ)
				vhd_queue_read(s->driver, tmp.treq);
			else if (tmp.op == VHD_OP_DATA_WRITE)
				vhd_queue_write(s->driver, tmp.treq);

			r = next;
		}
	} else {
		int err = req->error;
		unlock_bitmap(bm);
		free_vhd_bitmap(s, bm);
		return signal_completion(r, err);
	}

	if (!bitmap_in_use(bm))
		unlock_bitmap(bm);
}

static void
finish_bitmap_write(struct vhd_request *req)
{
	uint32_t blk;
	struct vhd_bitmap  *bm;
	struct vhd_transaction *tx;
	struct vhd_state *s = req->state;

	s->returned++;
	TRACE(s);

	blk = req->treq.sec / s->spb;
	bm  = get_bitmap(s, blk);
	tx  = &bm->tx;

	DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n",
	    blk, tx->started, tx->finished);
	ASSERT(tx->closed);
	ASSERT(bm && bitmap_valid(bm));
	ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));

	clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);

	finish_bitmap_transaction(s, bm, req->error);
}

static void
finish_data_read(struct vhd_request *req)
{
	struct vhd_state *s = req->state;

	DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", 
	    req->treq.sec, req->treq.sec / s->spb);
	signal_completion(req, 0);
}

static void
finish_data_write(struct vhd_request *req)
{
	int i;
	struct vhd_transaction *tx = req->tx;
	struct vhd_state *s = (struct vhd_state *)req->state;

	set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED);

	if (tx) {
		uint32_t blk, sec;
		struct vhd_bitmap *bm;

		blk = req->treq.sec / s->spb;
		sec = req->treq.sec % s->spb;
		bm  = get_bitmap(s, blk);

		ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));

		tx->finished++;

		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x04%"PRIx64", "
		    "tx->started: %d, tx->finished: %d\n", req->treq.sec,
		    req->treq.sec / s->spb, tx->started, tx->finished);

		if (!req->error)
			for (i = 0; i < req->treq.secs; i++)
				vhd_bitmap_set(&s->vhd, bm->shadow,  sec + i);

		if (transaction_completed(tx))
			finish_data_transaction(s, bm);

	} else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) {
		ASSERT(!req->next);
		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", 
		    req->treq.sec, req->treq.sec / s->spb);
		signal_completion(req, 0);
	}
}

void
vhd_complete(void *arg, struct tiocb *tiocb, int err)
{
	struct vhd_request *req = (struct vhd_request *)arg;
	struct vhd_state *s = req->state;
	struct iocb *io = &tiocb->iocb;

	s->completed++;
	TRACE(s);

	req->error = err;

	if (req->error)
		ERR(s, req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, "
		    "nbytes: %lu, blk: %"PRIu64", blk_offset: %u",
		    s->vhd.file, req->op, req->treq.sec, req->treq.secs,
		    io->u.c.nbytes, req->treq.sec / s->spb,
		    bat_entry(s, req->treq.sec / s->spb));

	switch (req->op) {
	case VHD_OP_DATA_READ:
		finish_data_read(req);
		break;

	case VHD_OP_DATA_WRITE:
		finish_data_write(req);
		break;

	case VHD_OP_BITMAP_READ:
		finish_bitmap_read(req);
		break;

	case VHD_OP_BITMAP_WRITE:
		finish_bitmap_write(req);
		break;

	case VHD_OP_ZERO_BM_WRITE:
		finish_zero_bm_write(req);
		break;

	case VHD_OP_REDUNDANT_BM_WRITE:
		finish_redundant_bm_write(req);
		break;

	case VHD_OP_BAT_WRITE:
		finish_bat_write(req);
		break;

	default:
		ASSERT(0);
		break;
	}
}

void 
vhd_debug(td_driver_t *driver)
{
	int i;
	struct vhd_state *s = (struct vhd_state *)driver->data;

	DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", "
	    "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed,
	    s->returned);
	DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n",
	    s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0));
	DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n",
	    s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0));

	DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%u total)\n", VHD_REQS_DATA);
	for (i = 0; i < VHD_REQS_DATA; i++) {
		struct vhd_request *r = &s->vreq_list[i];
		td_request_t *t       = &r->treq;
		const char *vname     = t->vreq ? t->vreq->name: NULL;
		if (t->secs)
			DBG(TLOG_WARN, "%d: vreq: %s.%d, err: %d, op: %d,"
			    " lsec: 0x%08"PRIx64", flags: %d, this: %p, "
			    "next: %p, tx: %p\n", i, vname, t->sidx, r->error, r->op,
			    t->sec, r->flags, r, r->next, r->tx);
	}

	DBG(TLOG_WARN, "BITMAP CACHE:\n");
	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		int qnum = 0, wnum = 0, rnum = 0;
		struct vhd_bitmap *bm = s->bitmap[i];
		struct vhd_transaction *tx;
		struct vhd_request *r;

		if (!bm)
			continue;

		tx = &bm->tx;
		r = bm->queue.head;
		while (r) {
			qnum++;
			r = r->next;
		}

		r = bm->waiting.head;
		while (r) {
			wnum++;
			r = r->next;
		}

		r = tx->requests.head;
		while (r) {
			rnum++;
			r = r->next;
		}

		DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, "
		    "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, "
		    "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n",
		    i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head,
		    wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error,
		    tx->started, tx->finished, tx->status, tx->requests.head, rnum);
	}

	DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, "
	    "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk,
	    s->bat.pbw_offset, s->bat.req.tx);

/*
	for (i = 0; i < s->hdr.max_bat_size; i++)
		DPRINTF("%d: %u\n", i, s->bat.bat[i]);
*/
}

struct tap_disk tapdisk_vhd = {
	.disk_type          = "tapdisk_vhd",
	.flags              = 0,
	.private_data_size  = sizeof(struct vhd_state),
	.td_open            = _vhd_open,
	.td_close           = _vhd_close,
	.td_queue_read      = vhd_queue_read,
	.td_queue_write     = vhd_queue_write,
	.td_get_parent_id   = vhd_get_parent_id,
	.td_validate_parent = vhd_validate_parent,
	.td_debug           = vhd_debug,
};
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
