Hello, I want to know about Blktap asynchronous i/o read write operations on VHD. I want to know detail flow of program block-VHD.c . How the read write operations are performed on VHD . And any improvement in code would be done for better read write operations. Description about dynamic disk structure. I wand modify code of fast clone copy for copy on read. Currently data is copy into vhd after write. So tell me about how step by step which functions are calling from block-vhd so fast copy is done.
I have attached the code of block-vhd.c. Thanks, Akash Talole
/* * Copyright (C) Citrix Systems Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; version 2.1 only * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /* * block-vhd.c: asynchronous vhd implementation. * * A note on write transactions: * Writes that require updating the BAT or bitmaps cannot be signaled * as complete until all updates have reached disk. Transactions are * used to ensure proper ordering in these cases. The two types of * transactions are as follows: * - Bitmap updates only: data writes that require updates to the same * bitmap are grouped in a transaction. Only after all data writes * in a transaction complete does the bitmap write commence. Only * after the bitmap write finishes are the data writes signalled as * complete. * - BAT and bitmap updates: data writes are grouped in transactions * as above, but a special extra write is included in the transaction, * which zeros out the newly allocated bitmap on disk. When the data * writes and the zero-bitmap write complete, the BAT and bitmap writes * are started in parallel. The transaction is completed only after both * the BAT and bitmap writes successfully return. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include <errno.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <uuid/uuid.h> /* For whatever reason, Linux packages this in */ /* e2fsprogs-devel. */ #include <string.h> /* for memset. */ #include <libaio.h> #include <sys/mman.h> #include <limits.h> #include "debug.h" #include "libvhd.h" #include "tapdisk.h" #include "tapdisk-driver.h" #include "tapdisk-interface.h" #include "tapdisk-disktype.h" #include "tapdisk-storage.h" unsigned int SPB; #define DEBUGGING 2 #define MICROSOFT_COMPAT #define VHD_BATMAP_MAX_RETRIES 10 #define __TRACE(s) \ do { \ DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %" \ PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: " \ "%u, BBLK: 0x%04x\n", \ s->vhd.file, s->queued, s->completed, s->returned, \ VHD_REQS_DATA - s->vreq_free_count, \ s->bat.pbw_blk); \ } while(0) #if (DEBUGGING == 1) #define DBG(level, _f, _a...) DPRINTF(_f, ##_a) #define ERR(_s, err, _f, _a...) DPRINTF("ERROR: %d: " _f, err, ##_a) #define TRACE(s) ((void)0) #elif (DEBUGGING == 2) #define DBG(level, _f, _a...) tlog_write(level, _f, ##_a) #define ERR(_s, _err, _f, _a...) tlog_drv_error((_s)->driver, _err, _f, ##_a) #define TRACE(s) __TRACE(s) #else #define DBG(level, _f, _a...) ((void)0) #define ERR(_s, err, _f, _a...) ((void)0) #define TRACE(s) ((void)0) #endif /******VHD DEFINES******/ #define VHD_CACHE_SIZE 32 #define VHD_REQS_DATA TAPDISK_DATA_REQUESTS #define VHD_REQS_META (VHD_CACHE_SIZE + 2) #define VHD_REQS_TOTAL (VHD_REQS_DATA + VHD_REQS_META) #define VHD_OP_BAT_WRITE 0 #define VHD_OP_DATA_READ 1 #define VHD_OP_DATA_WRITE 2 #define VHD_OP_BITMAP_READ 3 #define VHD_OP_BITMAP_WRITE 4 #define VHD_OP_ZERO_BM_WRITE 5 #define VHD_OP_REDUNDANT_BM_WRITE 6 #define VHD_BM_BAT_LOCKED 0 #define VHD_BM_BAT_CLEAR 1 #define VHD_BM_BIT_CLEAR 2 #define VHD_BM_BIT_SET 3 #define VHD_BM_NOT_CACHED 4 #define VHD_BM_READ_PENDING 5 #define VHD_FLAG_OPEN_RDONLY 1 #define VHD_FLAG_OPEN_NO_CACHE 2 #define VHD_FLAG_OPEN_QUIET 4 #define VHD_FLAG_OPEN_STRICT 8 #define VHD_FLAG_OPEN_QUERY 16 #define VHD_FLAG_OPEN_PREALLOCATE 32 #define VHD_FLAG_OPEN_NO_O_DIRECT 64 #define VHD_FLAG_OPEN_LOCAL_CACHE 128 #define VHD_FLAG_BAT_LOCKED 1 #define VHD_FLAG_BAT_WRITE_STARTED 2 #define VHD_FLAG_BM_UPDATE_BAT 1 #define VHD_FLAG_BM_WRITE_PENDING 2 #define VHD_FLAG_BM_READ_PENDING 4 #define VHD_FLAG_BM_LOCKED 8 #define VHD_FLAG_REQ_UPDATE_BAT 1 #define VHD_FLAG_REQ_UPDATE_BITMAP 2 #define VHD_FLAG_REQ_QUEUED 4 #define VHD_FLAG_REQ_FINISHED 8 #define VHD_FLAG_TX_LIVE 1 #define VHD_FLAG_TX_UPDATE_BAT 2 typedef uint8_t vhd_flag_t; struct vhd_state; struct vhd_request; struct vhd_req_list { struct vhd_request *head; struct vhd_request *tail; }; struct vhd_transaction { int error; int closed; int started; int finished; vhd_flag_t status; struct vhd_req_list requests; }; struct vhd_request { int error; uint8_t op; vhd_flag_t flags; td_request_t treq; struct tiocb tiocb; struct vhd_state *state; struct vhd_request *next; struct vhd_transaction *tx; }; struct vhd_bat_state { vhd_bat_t bat; vhd_batmap_t batmap; vhd_flag_t status; uint32_t pbw_blk; /* blk num of pending write */ uint64_t pbw_offset; /* file offset of same */ struct vhd_request req; /* for writing bat table */ struct vhd_request zero_req; /* for initializing bitmaps */ char *bat_buf; }; struct vhd_bitmap { uint32_t blk; uint64_t seqno; /* lru sequence number */ vhd_flag_t status; char *map; /* map should only be modified * in finish_bitmap_write */ char *shadow; /* in-memory bitmap changes are * made to shadow and copied to * map only after having been * flushed to disk */ struct vhd_transaction tx; /* transaction data structure * encapsulating data, bitmap, * and bat writes */ struct vhd_req_list queue; /* data writes waiting for next * transaction */ struct vhd_req_list waiting; /* pending requests that cannot * be serviced until this bitmap * is read from disk */ struct vhd_request req; }; struct vhd_state { vhd_flag_t flags; /* VHD stuff */ vhd_context_t vhd; uint32_t spp; /* sectors per page */ uint32_t spb; /* sectors per block */ uint64_t first_db; /* pointer to datablock 0 */ /** * Pointer to the next (unallocated) datablock. If greater than UINT_MAX, * there are no more blocks available. */ uint64_t next_db; struct vhd_bat_state bat; uint64_t bm_lru; /* lru sequence number */ uint32_t bm_secs; /* size of bitmap, in sectors */ struct vhd_bitmap *bitmap[VHD_CACHE_SIZE]; int bm_free_count; struct vhd_bitmap *bitmap_free[VHD_CACHE_SIZE]; struct vhd_bitmap bitmap_list[VHD_CACHE_SIZE]; int vreq_free_count; struct vhd_request *vreq_free[VHD_REQS_DATA]; struct vhd_request vreq_list[VHD_REQS_DATA]; /* for redundant bitmap writes */ int padbm_size; char *padbm_buf; long int debug_skipped_redundant_writes; long int debug_done_redundant_writes; td_driver_t *driver; uint64_t queued; uint64_t completed; uint64_t returned; uint64_t reads; uint64_t read_size; uint64_t writes; uint64_t write_size; }; #define test_vhd_flag(word, flag) ((word) & (flag)) #define set_vhd_flag(word, flag) ((word) |= (flag)) #define clear_vhd_flag(word, flag) ((word) &= ~(flag)) #define bat_entry(s, blk) ((s)->bat.bat.bat[(blk)]) static void vhd_complete(void *, struct tiocb *, int); static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *); static struct vhd_state *_vhd_master; static unsigned long _vhd_zsize; static char *_vhd_zeros = NULL; int _dev_zero = -1; static int vhd_initialize(struct vhd_state *s) { int err; if (_vhd_zeros) return 0; _vhd_zsize = 2 * getpagesize(); if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) _vhd_zsize += VHD_BLOCK_SIZE; _dev_zero = open("/dev/zero", O_RDONLY); if (unlikely(_dev_zero == -1)) { err = errno; EPRINTF("failed to open /dev/zero: %s\n", strerror(err)); return -err; } _vhd_zeros = mmap(NULL, _vhd_zsize, PROT_READ, MAP_SHARED, _dev_zero, 0); if (_vhd_zeros == MAP_FAILED) { int _err; err = errno; EPRINTF("vhd_initialize failed: %s\n", strerror(err)); _vhd_zeros = NULL; _vhd_zsize = 0; _err = close(_dev_zero); if (unlikely(_err == -1)) EPRINTF("failed to close /dev/zero: %s (error ignored)\n", strerror(errno)); else _dev_zero = -1; return -err; } _vhd_master = s; return 0; } static void vhd_free(struct vhd_state *s) { if (_vhd_master != s || !_vhd_zeros) return; free(s->padbm_buf); munmap(_vhd_zeros, _vhd_zsize); _vhd_zsize = 0; _vhd_zeros = NULL; _vhd_master = NULL; if (_dev_zero != -1) { int _err = close(_dev_zero); if (unlikely(_err == -1)) EPRINTF("failed to close /dev/zero: %s (error ignored)\n", strerror(errno)); else _dev_zero = -1; } } static char * _get_vhd_zeros(const char *func, unsigned long size) { if (!_vhd_zeros || _vhd_zsize < size) { EPRINTF("invalid zero request from %s: %lu, %lu, %p\n", func, size, _vhd_zsize, _vhd_zeros); ASSERT(0); } return _vhd_zeros; } #define vhd_zeros(size) _get_vhd_zeros(__func__, size) static inline void set_batmap(struct vhd_state *s, uint32_t blk) { if (s->bat.batmap.map) { vhd_batmap_set(&s->vhd, &s->bat.batmap, blk); DBG(TLOG_DBG, "block 0x%x completely full\n", blk); } } static inline int test_batmap(struct vhd_state *s, uint32_t blk) { if (!s->bat.batmap.map) return 0; return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk); } static int vhd_kill_footer(struct vhd_state *s) { int err; off64_t end; void *zeros; if (s->vhd.footer.type == HD_TYPE_FIXED) return 0; err = posix_memalign(&zeros, 512, 512); if (err) return -err; err = 1; memset(zeros, 0xc7c7c7c7, 512); if ((end = lseek64(s->vhd.fd, 0, SEEK_END)) == -1) goto fail; if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == -1) goto fail; if (write(s->vhd.fd, zeros, 512) != 512) goto fail; err = 0; fail: free(zeros); if (err) return (errno ? -errno : -EIO); return 0; } static inline int find_next_free_block(struct vhd_state *s) { int err; off64_t eom; uint32_t i, entry; err = vhd_end_of_headers(&s->vhd, &eom); if (err) return err; s->next_db = secs_round_up(eom); s->first_db = s->next_db; if ((s->first_db + s->bm_secs) % s->spp) s->first_db += (s->spp - ((s->first_db + s->bm_secs) % s->spp)); for (i = 0; i < s->bat.bat.entries; i++) { entry = bat_entry(s, i); if (entry != DD_BLK_UNUSED && entry >= s->next_db) s->next_db = (uint64_t)entry + (uint64_t)s->spb + (uint64_t)s->bm_secs; if (s->next_db > UINT_MAX) break; } return 0; } static void vhd_free_bat(struct vhd_state *s) { free(s->bat.bat.bat); free(s->bat.batmap.map); free(s->bat.bat_buf); memset(&s->bat, 0, sizeof(struct vhd_bat)); } static int vhd_initialize_bat(struct vhd_state *s) { int err, batmap_required, i; void *buf; memset(&s->bat, 0, sizeof(struct vhd_bat)); err = vhd_read_bat(&s->vhd, &s->bat.bat); if (err) { EPRINTF("%s: reading bat: %d\n", s->vhd.file, err); return err; } batmap_required = 1; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) { batmap_required = 0; } else { err = find_next_free_block(s); if (err) goto fail; } if (vhd_has_batmap(&s->vhd)) { for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) { err = vhd_read_batmap(&s->vhd, &s->bat.batmap); if (err) { EPRINTF("%s: reading batmap: %d\n", s->vhd.file, err); if (batmap_required) goto fail; } else { break; } } if (err) EPRINTF("%s: ignoring non-critical batmap error\n", s->vhd.file); } err = posix_memalign(&buf, VHD_SECTOR_SIZE, VHD_SECTOR_SIZE); if (err) goto fail; s->bat.bat_buf = buf; return 0; fail: vhd_free_bat(s); return err; } static void vhd_free_bitmap_cache(struct vhd_state *s) { int i; struct vhd_bitmap *bm; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap_list + i; free(bm->map); free(bm->shadow); s->bitmap_free[i] = NULL; } memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE); } static int vhd_initialize_bitmap_cache(struct vhd_state *s) { int i, err, map_size; struct vhd_bitmap *bm; void *map, *shadow; memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE); s->bm_lru = 0; map_size = vhd_sectors_to_bytes(s->bm_secs); s->bm_free_count = VHD_CACHE_SIZE; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap_list + i; err = posix_memalign(&map, 512, map_size); if (err) goto fail; bm->map = map; err = posix_memalign(&shadow, 512, map_size); if (err) goto fail; bm->shadow = shadow; memset(bm->map, 0, map_size); memset(bm->shadow, 0, map_size); s->bitmap_free[i] = bm; } return 0; fail: vhd_free_bitmap_cache(s); return err; } static int vhd_initialize_dynamic_disk(struct vhd_state *s) { uint32_t bm_size; void *buf; int err; err = vhd_get_header(&s->vhd); if (err) { if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) EPRINTF("Error reading VHD DD header.\n"); return err; } if (s->vhd.header.hdr_ver != 0x00010000) { EPRINTF("unsupported header version! (0x%x)\n", s->vhd.header.hdr_ver); return -EINVAL; } s->spp = getpagesize() >> VHD_SECTOR_SHIFT; s->spb = s->vhd.header.block_size >> VHD_SECTOR_SHIFT; s->bm_secs = secs_round_up_no_zero(s->spb >> 3); s->padbm_size = (s->bm_secs / getpagesize()) * getpagesize(); if (s->bm_secs % getpagesize()) s->padbm_size += getpagesize(); err = posix_memalign(&buf, 512, s->padbm_size); if (err) return -err; s->padbm_buf = buf; bm_size = s->bm_secs << VHD_SECTOR_SHIFT; memset(s->padbm_buf, 0, s->padbm_size - bm_size); memset(s->padbm_buf + (s->padbm_size - bm_size), ~0, bm_size); s->debug_skipped_redundant_writes = 0; s->debug_done_redundant_writes = 0; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE)) return 0; err = vhd_initialize_bat(s); if (err) return err; err = vhd_initialize_bitmap_cache(s); if (err) { vhd_free_bat(s); return err; } return 0; } static int vhd_check_version(struct vhd_state *s) { if (strncmp(s->vhd.footer.crtr_app, "tap", 3)) return 0; if (s->vhd.footer.crtr_ver > VHD_CURRENT_VERSION) { if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) EPRINTF("WARNING: %s vhd creator version 0x%08x, " "but only versions up to 0x%08x are " "supported for IO\n", s->vhd.file, s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION); return -EINVAL; } return 0; } static void vhd_log_open(struct vhd_state *s) { char buf[5]; uint32_t i, allocated, full; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) return; snprintf(buf, sizeof(buf), "%s", s->vhd.footer.crtr_app); if (!vhd_type_dynamic(&s->vhd)) { DPRINTF("%s version: %s 0x%08x\n", s->vhd.file, buf, s->vhd.footer.crtr_ver); return; } allocated = 0; full = 0; for (i = 0; i < s->bat.bat.entries; i++) { if (bat_entry(s, i) != DD_BLK_UNUSED) allocated++; if (test_batmap(s, i)) full++; } DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n", s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries, allocated, full, s->next_db); } static int __vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags) { int i, o_flags, err; struct vhd_state *s; DBG(TLOG_INFO, "vhd_open: %s\n", name); if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT)) libvhd_set_log_level(1); s = (struct vhd_state *)driver->data; memset(s, 0, sizeof(struct vhd_state)); s->flags = flags; s->driver = driver; err = vhd_initialize(s); if (err) return err; o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ? VHD_OPEN_RDONLY : VHD_OPEN_RDWR); if ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY) || test_vhd_flag(flags, VHD_FLAG_OPEN_LOCAL_CACHE)) && test_vhd_flag(flags, VHD_FLAG_OPEN_NO_O_DIRECT)) set_vhd_flag(o_flags, VHD_OPEN_CACHED); if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT)) set_vhd_flag(o_flags, VHD_OPEN_STRICT); err = vhd_open(&s->vhd, name, o_flags); if (err) { libvhd_set_log_level(1); err = vhd_open(&s->vhd, name, o_flags); if (err) { EPRINTF("Unable to open [%s] (%d)!\n", name, err); return err; } } err = vhd_check_version(s); if (err) goto fail; s->spb = s->spp = 1; if (vhd_type_dynamic(&s->vhd)) { err = vhd_initialize_dynamic_disk(s); if (err) goto fail; } vhd_log_open(s); SPB = s->spb; s->vreq_free_count = VHD_REQS_DATA; for (i = 0; i < VHD_REQS_DATA; i++) s->vreq_free[i] = s->vreq_list + i; driver->info.size = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT; driver->info.sector_size = VHD_SECTOR_SIZE; driver->info.info = 0; DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n", driver->info.size, driver->info.sector_size, driver->info.info); if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) && !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) { err = vhd_kill_footer(s); if (err) { DPRINTF("ERROR killing footer: %d\n", err); goto fail; } s->writes++; } return 0; fail: vhd_free_bat(s); vhd_free_bitmap_cache(s); vhd_close(&s->vhd); vhd_free(s); return err; } static int _vhd_open(td_driver_t *driver, const char *name, td_flag_t flags) { vhd_flag_t vhd_flags = 0; if (flags & TD_OPEN_RDONLY) vhd_flags |= VHD_FLAG_OPEN_RDONLY; if (flags & TD_OPEN_NO_O_DIRECT) vhd_flags |= VHD_FLAG_OPEN_NO_O_DIRECT; if (flags & TD_OPEN_QUIET) vhd_flags |= VHD_FLAG_OPEN_QUIET; if (flags & TD_OPEN_STRICT) vhd_flags |= VHD_FLAG_OPEN_STRICT; if (flags & TD_OPEN_QUERY) vhd_flags |= (VHD_FLAG_OPEN_QUERY | VHD_FLAG_OPEN_QUIET | VHD_FLAG_OPEN_RDONLY | VHD_FLAG_OPEN_NO_CACHE); if (flags & TD_OPEN_LOCAL_CACHE) vhd_flags |= VHD_FLAG_OPEN_LOCAL_CACHE; /* pre-allocate for all but NFS and LVM storage */ driver->storage = tapdisk_storage_type(name); if (driver->storage != TAPDISK_STORAGE_TYPE_NFS && driver->storage != TAPDISK_STORAGE_TYPE_LVM) vhd_flags |= VHD_FLAG_OPEN_PREALLOCATE; return __vhd_open(driver, name, vhd_flags); } static void vhd_log_close(struct vhd_state *s) { uint32_t i, allocated, full; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) return; allocated = 0; full = 0; for (i = 0; i < s->bat.bat.entries; i++) { if (bat_entry(s, i) != DD_BLK_UNUSED) allocated++; if (test_batmap(s, i)) full++; } DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n", s->vhd.file, s->bat.bat.entries, allocated, full, s->next_db); } static int _vhd_close(td_driver_t *driver) { int err; struct vhd_state *s; DBG(TLOG_WARN, "vhd_close\n"); s = (struct vhd_state *)driver->data; DPRINTF("gaps written/skipped: %ld/%ld\n", s->debug_done_redundant_writes, s->debug_skipped_redundant_writes); /* don't write footer if tapdisk is read-only */ if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) goto free; /* * write footer if: * - we killed it on open (opened with strict) * - we've written data since opening */ if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) { memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t)); err = vhd_write_footer(&s->vhd, &s->vhd.footer); memset(&s->vhd.bat, 0, sizeof(vhd_bat_t)); if (err) EPRINTF("writing %s footer: %d\n", s->vhd.file, err); if (!vhd_has_batmap(&s->vhd)) goto free; err = vhd_write_batmap(&s->vhd, &s->bat.batmap); if (err) EPRINTF("writing %s batmap: %d\n", s->vhd.file, err); } free: vhd_log_close(s); vhd_free_bat(s); vhd_free_bitmap_cache(s); vhd_close(&s->vhd); vhd_free(s); memset(s, 0, sizeof(struct vhd_state)); return 0; } int vhd_validate_parent(td_driver_t *child_driver, td_driver_t *parent_driver, td_flag_t flags) { struct vhd_state *child = (struct vhd_state *)child_driver->data; struct vhd_state *parent; if (parent_driver->type != DISK_TYPE_VHD) { if (child_driver->type != DISK_TYPE_VHD) return -EINVAL; if (child->vhd.footer.type != HD_TYPE_DIFF) return -EINVAL; if (!vhd_parent_raw(&child->vhd)) return -EINVAL; return 0; } parent = (struct vhd_state *)parent_driver->data; /* * This check removed because of cases like: * - parent VHD marked as 'hidden' * - parent VHD modified during coalesce */ /* if (stat(parent->vhd.file, &stats)) { DPRINTF("ERROR stating parent file %s\n", parent->vhd.file); return -errno; } if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) { DPRINTF("ERROR: parent file has been modified since " "snapshot. Child image no longer valid.\n"); return -EINVAL; } */ if (uuid_compare(child->vhd.header.prt_uuid, parent->vhd.footer.uuid)) { DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since " "snapshot. Child image no longer valid.\n", __func__, child->vhd.file, parent->vhd.file); return -EINVAL; } /* TODO: compare sizes */ return 0; } int vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { int err; char *parent; struct vhd_state *s; int flags; DBG(TLOG_DBG, "\n"); flags = id->flags; memset(id, 0, sizeof(td_disk_id_t)); s = (struct vhd_state *)driver->data; if (s->vhd.footer.type != HD_TYPE_DIFF) return TD_NO_PARENT; err = vhd_parent_locator_get(&s->vhd, &parent); if (err) return err; id->name = parent; id->type = vhd_parent_raw(&s->vhd) ? DISK_TYPE_AIO : DISK_TYPE_VHD; id->flags = flags|TD_OPEN_SHAREABLE|TD_OPEN_RDONLY; return 0; } static inline void clear_req_list(struct vhd_req_list *list) { list->head = list->tail = NULL; } static inline void add_to_tail(struct vhd_req_list *list, struct vhd_request *e) { if (!list->head) list->head = list->tail = e; else list->tail = list->tail->next = e; } static inline int remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e) { struct vhd_request *i = list->head; if (list->head == e) { if (list->tail == e) clear_req_list(list); else list->head = list->head->next; return 0; } while (i->next) { if (i->next == e) { if (list->tail == e) { i->next = NULL; list->tail = i; } else i->next = i->next->next; return 0; } i = i->next; } return -EINVAL; } static inline void init_vhd_request(struct vhd_state *s, struct vhd_request *req) { memset(req, 0, sizeof(struct vhd_request)); req->state = s; } static inline void init_tx(struct vhd_transaction *tx) { memset(tx, 0, sizeof(struct vhd_transaction)); } static inline void add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r) { ASSERT(!tx->closed); r->tx = tx; tx->started++; add_to_tail(&tx->requests, r); set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE); DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, " "started: %d, finished: %d, status: %u\n", r->treq.sec / SPB, r->treq.sec, tx, tx->started, tx->finished, tx->status); } static inline int transaction_completed(struct vhd_transaction *tx) { return (tx->started == tx->finished); } static inline void init_bat(struct vhd_state *s) { s->bat.req.tx = NULL; s->bat.req.next = NULL; s->bat.req.error = 0; s->bat.pbw_blk = 0; s->bat.pbw_offset = 0; s->bat.status = 0; } static inline void lock_bat(struct vhd_state *s) { set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); } static inline void unlock_bat(struct vhd_state *s) { clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); } static inline int bat_locked(struct vhd_state *s) { return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); } static inline void init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { bm->blk = 0; bm->seqno = 0; bm->status = 0; init_tx(&bm->tx); clear_req_list(&bm->queue); clear_req_list(&bm->waiting); memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs)); memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs)); init_vhd_request(s, &bm->req); } static inline struct vhd_bitmap * get_bitmap(struct vhd_state *s, uint32_t block) { int i; struct vhd_bitmap *bm; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap[i]; if (bm && bm->blk == block) return bm; } return NULL; } static inline void lock_bitmap(struct vhd_bitmap *bm) { set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); } static inline void unlock_bitmap(struct vhd_bitmap *bm) { clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); } static inline int bitmap_locked(struct vhd_bitmap *bm) { return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); } static inline int bitmap_valid(struct vhd_bitmap *bm) { return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); } static inline int bitmap_in_use(struct vhd_bitmap *bm) { return (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING) || test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING) || test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT) || bm->waiting.head || bm->tx.requests.head || bm->queue.head); } static inline int bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm) { int i, n; n = s->spb >> 3; for (i = 0; i < n; i++) if (bm->map[i] != (char)0xFF) return 0; DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk); return 1; } static struct vhd_bitmap * remove_lru_bitmap(struct vhd_state *s) { int i, idx = 0; uint64_t seq = s->bm_lru; struct vhd_bitmap *bm, *lru = NULL; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap[i]; if (bm && bm->seqno < seq && !bitmap_locked(bm)) { idx = i; lru = bm; seq = lru->seqno; } } if (lru) { s->bitmap[idx] = NULL; ASSERT(!bitmap_in_use(lru)); } return lru; } static int alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk) { struct vhd_bitmap *bm; *bitmap = NULL; if (s->bm_free_count > 0) { bm = s->bitmap_free[--s->bm_free_count]; } else { bm = remove_lru_bitmap(s); if (!bm) return -EBUSY; } init_vhd_bitmap(s, bm); bm->blk = blk; *bitmap = bm; return 0; } static inline uint64_t __bitmap_lru_seqno(struct vhd_state *s) { int i; struct vhd_bitmap *bm; if (s->bm_lru == 0xffffffff) { s->bm_lru = 0; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap[i]; if (bm) { bm->seqno >>= 1; if (bm->seqno > s->bm_lru) s->bm_lru = bm->seqno; } } } return ++s->bm_lru; } static inline void touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { bm->seqno = __bitmap_lru_seqno(s); } static inline void install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { int i; for (i = 0; i < VHD_CACHE_SIZE; i++) { if (!s->bitmap[i]) { touch_bitmap(s, bm); s->bitmap[i] = bm; return; } } ASSERT(0); } static inline void free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { int i; for (i = 0; i < VHD_CACHE_SIZE; i++) if (s->bitmap[i] == bm) break; ASSERT(!bitmap_locked(bm)); ASSERT(!bitmap_in_use(bm)); ASSERT(i < VHD_CACHE_SIZE); s->bitmap[i] = NULL; s->bitmap_free[s->bm_free_count++] = bm; } static int read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op) { uint32_t blk, sec; struct vhd_bitmap *bm; /* in fixed disks, every block is present */ if (s->vhd.footer.type == HD_TYPE_FIXED) return VHD_BM_BIT_SET; /* the extent the logical sector falls in */ blk = sector / s->spb; /* offset within the extent the logical sector is located */ sec = sector % s->spb; if (blk > s->vhd.header.max_bat_size) { DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n", sector, op); return -EINVAL; } if (bat_entry(s, blk) == DD_BLK_UNUSED) { if (op == VHD_OP_DATA_WRITE && s->bat.pbw_blk != blk && bat_locked(s)) return VHD_BM_BAT_LOCKED; return VHD_BM_BAT_CLEAR; } if (test_batmap(s, blk)) { DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk); return VHD_BM_BIT_SET; } bm = get_bitmap(s, blk); if (!bm) return VHD_BM_NOT_CACHED; /* bump lru count */ touch_bitmap(s, bm); if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)) return VHD_BM_READ_PENDING; return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ? VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR); } static int read_bitmap_cache_span(struct vhd_state *s, uint64_t sector, int nr_secs, int value) { int ret; uint32_t blk, sec; struct vhd_bitmap *bm; /* in fixed disks, every block is present */ if (s->vhd.footer.type == HD_TYPE_FIXED) return nr_secs; sec = sector % s->spb; blk = sector / s->spb; if (test_batmap(s, blk)) return MIN(nr_secs, s->spb - sec); bm = get_bitmap(s, blk); ASSERT(bm && bitmap_valid(bm)); for (ret = 0; sec < s->spb && ret < nr_secs; sec++, ret++) if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value) break; return ret; } static inline struct vhd_request * alloc_vhd_request(struct vhd_state *s) { struct vhd_request *req = NULL; if (s->vreq_free_count > 0) { req = s->vreq_free[--s->vreq_free_count]; ASSERT(req->treq.secs == 0); init_vhd_request(s, req); return req; } return NULL; } static inline void free_vhd_request(struct vhd_state *s, struct vhd_request *req) { memset(req, 0, sizeof(struct vhd_request)); s->vreq_free[s->vreq_free_count++] = req; } static inline void aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset) { struct tiocb *tiocb = &req->tiocb; td_prep_read(tiocb, s->vhd.fd, req->treq.buf, vhd_sectors_to_bytes(req->treq.secs), offset, vhd_complete, req); td_queue_tiocb(s->driver, tiocb); s->queued++; s->reads++; s->read_size += req->treq.secs; TRACE(s); } static inline void aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset) { struct tiocb *tiocb = &req->tiocb; td_prep_write(tiocb, s->vhd.fd, req->treq.buf, vhd_sectors_to_bytes(req->treq.secs), offset, vhd_complete, req); td_queue_tiocb(s->driver, tiocb); s->queued++; s->writes++; s->write_size += req->treq.secs; TRACE(s); } /** * Reserves a new extent. * * @returns a 64-bit unsigned integer where the error code is stored in the * upper 32 bits and the reserved block number is stored in the lower 32 bits. * If an error is returned (the upper 32 bits are not zero), the lower 32 bits * are undefined. */ static inline uint64_t reserve_new_block(struct vhd_state *s, uint32_t blk) { int gap = 0; ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED)); /* data region of segment should begin on page boundary */ if ((s->next_db + s->bm_secs) % s->spp) gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp)); if (s->next_db + gap > UINT_MAX) return (uint64_t)ENOSPC << 32; s->bat.pbw_blk = blk; s->bat.pbw_offset = s->next_db + gap; return s->next_db; } static int schedule_bat_write(struct vhd_state *s) { int i; uint32_t blk; char *buf; uint64_t offset; struct vhd_request *req; ASSERT(bat_locked(s)); req = &s->bat.req; buf = s->bat.bat_buf; blk = s->bat.pbw_blk; init_vhd_request(s, req); memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512); ((uint32_t *)buf)[blk % 128] = s->bat.pbw_offset; for (i = 0; i < 128; i++) BE32_OUT(&((uint32_t *)buf)[i]); offset = s->vhd.header.table_offset + (blk - (blk % 128)) * 4; req->treq.secs = 1; req->treq.buf = buf; req->op = VHD_OP_BAT_WRITE; req->next = NULL; aio_write(s, req, offset); set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED); DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", " "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset); return 0; } static void schedule_zero_bm_write(struct vhd_state *s, struct vhd_bitmap *bm, uint64_t lb_end) { uint64_t offset; struct vhd_request *req = &s->bat.zero_req; init_vhd_request(s, req); offset = vhd_sectors_to_bytes(lb_end); req->op = VHD_OP_ZERO_BM_WRITE; req->treq.sec = s->bat.pbw_blk * s->spb; req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs; req->treq.buf = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs)); req->next = NULL; DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n", s->bat.pbw_blk, offset); lock_bitmap(bm); add_to_transaction(&bm->tx, req); aio_write(s, req, offset); } /* This is a performance optimization. When writing sequentially into full * blocks, skipping (up-to-date) bitmaps causes an approx. 25% reduction in * throughput. To prevent skipping, we issue redundant writes into the (padded) * bitmap area just to make all writes sequential. This will help VHDs on raw * block devices, while the FS-based VHDs shouldn't suffer much. * * Note that it only makes sense to perform this reduntant bitmap write if the * block is completely full (i.e. the batmap entry is set). If the block is not * completely full then one of the following two things will be true: * 1. we'll either be allocating new sectors in this block and writing its * bitmap transactionally, which will be slow anyways; or * 2. the IO will be skipping over the unallocated sectors again, so the * pattern will not be sequential anyways * In either case a redundant bitmap write becomes pointless. This fact * simplifies the implementation of redundant writes: since we know the bitmap * cannot be updated by anyone else, we don't have to worry about transactions * or potential write conflicts. * */ static void schedule_redundant_bm_write(struct vhd_state *s, uint32_t blk) { uint64_t offset; struct vhd_request *req; ASSERT(s->vhd.footer.type != HD_TYPE_FIXED); ASSERT(test_batmap(s, blk)); req = alloc_vhd_request(s); if (!req) return; req->treq.buf = s->padbm_buf; offset = bat_entry(s, blk); ASSERT(offset != DD_BLK_UNUSED); offset <<= VHD_SECTOR_SHIFT; offset -= s->padbm_size - (s->bm_secs << VHD_SECTOR_SHIFT); req->op = VHD_OP_REDUNDANT_BM_WRITE; req->treq.sec = blk * s->spb; req->treq.secs = s->padbm_size >> VHD_SECTOR_SHIFT; req->next = NULL; DBG(TLOG_DBG, "blk: %u, writing redundant bitmap at %" PRIu64 "\n", blk, offset); aio_write(s, req, offset); } static int update_bat(struct vhd_state *s, uint32_t blk) { int err; uint64_t lb_end; struct vhd_bitmap *bm; ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED); if (bat_locked(s)) { ASSERT(s->bat.pbw_blk == blk); return 0; } /* empty bitmap could already be in * cache if earlier bat update failed */ bm = get_bitmap(s, blk); if (!bm) { /* install empty bitmap in cache */ err = alloc_vhd_bitmap(s, &bm, blk); if (err) return err; install_bitmap(s, bm); } lock_bat(s); lb_end = reserve_new_block(s, blk); if (lb_end >> 32) { unlock_bat(s); return -(lb_end >> 32); } schedule_zero_bm_write(s, bm, lb_end); set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT); return 0; } static int allocate_block(struct vhd_state *s, uint32_t blk) { int err, gap; uint64_t offset, size; struct vhd_bitmap *bm; ssize_t count; uint64_t next_db; ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED); if (bat_locked(s)) { ASSERT(s->bat.pbw_blk == blk); if (s->bat.req.error) return -EBUSY; return 0; } gap = 0; offset = vhd_sectors_to_bytes(s->next_db); next_db = s->next_db; /* data region of segment should begin on page boundary */ if ((next_db + s->bm_secs) % s->spp) { gap = (s->spp - ((next_db + s->bm_secs) % s->spp)); next_db += gap; } if (next_db > UINT_MAX) return -ENOSPC; s->next_db = next_db; s->bat.pbw_blk = blk; s->bat.pbw_offset = s->next_db; DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset); if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) { ERR(s, -errno, "lseek failed\n"); return -errno; } size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap); count = write(s->vhd.fd, vhd_zeros(size), size); if (count != size) { err = count < 0 ? -errno : -ENOSPC; ERR(s, -errno, "write failed (%zd, offset %"PRIu64")\n", count, offset); return err; } /* empty bitmap could already be in * cache if earlier bat update failed */ bm = get_bitmap(s, blk); if (!bm) { /* install empty bitmap in cache */ err = alloc_vhd_bitmap(s, &bm, blk); if (err) return err; install_bitmap(s, bm); } lock_bat(s); lock_bitmap(bm); schedule_bat_write(s); add_to_transaction(&bm->tx, &s->bat.req); return 0; } static int schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags) { uint64_t offset; uint32_t blk = 0, sec = 0; struct vhd_bitmap *bm; struct vhd_request *req; if (s->vhd.footer.type == HD_TYPE_FIXED) { offset = vhd_sectors_to_bytes(treq.sec); goto make_request; } blk = treq.sec / s->spb; sec = treq.sec % s->spb; bm = get_bitmap(s, blk); offset = bat_entry(s, blk); ASSERT(offset != DD_BLK_UNUSED); ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm))); offset += s->bm_secs + sec; offset = vhd_sectors_to_bytes(offset); make_request: req = alloc_vhd_request(s); if (!req) return -EBUSY; req->treq = treq; req->flags = flags; req->op = VHD_OP_DATA_READ; req->next = NULL; aio_read(s, req, offset); DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, " "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n", s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags, treq.buf); return 0; } static int schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags) { int err; uint64_t offset; uint32_t blk = 0, sec = 0; struct vhd_bitmap *bm = NULL; struct vhd_request *req; if (s->vhd.footer.type == HD_TYPE_FIXED) { offset = vhd_sectors_to_bytes(treq.sec); goto make_request; } blk = treq.sec / s->spb; sec = treq.sec % s->spb; offset = bat_entry(s, blk); if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) { if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) err = allocate_block(s, blk); else err = update_bat(s, blk); if (err) return err; offset = s->bat.pbw_offset; } offset += s->bm_secs + sec; offset = vhd_sectors_to_bytes(offset); make_request: req = alloc_vhd_request(s); if (!req) return -EBUSY; req->treq = treq; req->flags = flags; req->op = VHD_OP_DATA_WRITE; req->next = NULL; if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) { bm = get_bitmap(s, blk); ASSERT(bm && bitmap_valid(bm)); lock_bitmap(bm); if (bm->tx.closed) { add_to_tail(&bm->queue, req); set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED); } else add_to_transaction(&bm->tx, req); } else if (sec == 0 && /* first sector inside data block */ s->vhd.footer.type != HD_TYPE_FIXED && bat_entry(s, blk) != s->first_db && test_batmap(s, blk)) schedule_redundant_bm_write(s, blk); aio_write(s, req, offset); DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, " "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n", s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags); return 0; } static int schedule_bitmap_read(struct vhd_state *s, uint32_t blk) { int err; uint64_t offset; struct vhd_bitmap *bm; struct vhd_request *req = NULL; ASSERT(vhd_type_dynamic(&s->vhd)); offset = bat_entry(s, blk); ASSERT(offset != DD_BLK_UNUSED); ASSERT(!get_bitmap(s, blk)); offset = vhd_sectors_to_bytes(offset); err = alloc_vhd_bitmap(s, &bm, blk); if (err) return err; req = &bm->req; init_vhd_request(s, req); req->treq.sec = blk * s->spb; req->treq.secs = s->bm_secs; req->treq.buf = bm->map; req->treq.cb = NULL; req->op = VHD_OP_BITMAP_READ; req->next = NULL; aio_read(s, req, offset); lock_bitmap(bm); install_bitmap(s, bm); set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, " "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk, req->treq.secs, offset); return 0; } static void schedule_bitmap_write(struct vhd_state *s, uint32_t blk) { uint64_t offset; struct vhd_bitmap *bm; struct vhd_request *req; bm = get_bitmap(s, blk); offset = bat_entry(s, blk); ASSERT(vhd_type_dynamic(&s->vhd)); ASSERT(bm && bitmap_valid(bm) && !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING)); if (offset == DD_BLK_UNUSED) { ASSERT(bat_locked(s) && s->bat.pbw_blk == blk); offset = s->bat.pbw_offset; } offset = vhd_sectors_to_bytes(offset); req = &bm->req; init_vhd_request(s, req); req->treq.sec = blk * s->spb; req->treq.secs = s->bm_secs; req->treq.buf = bm->shadow; req->treq.cb = NULL; req->op = VHD_OP_BITMAP_WRITE; req->next = NULL; aio_write(s, req, offset); lock_bitmap(bm); touch_bitmap(s, bm); /* bump lru count */ set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING); DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, " "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec, req->treq.secs, offset); } /* * queued requests will be submitted once the bitmap * describing them is read and the requests are validated. */ static int __vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq) { uint32_t blk; struct vhd_bitmap *bm; struct vhd_request *req; ASSERT(vhd_type_dynamic(&s->vhd)); blk = treq.sec / s->spb; bm = get_bitmap(s, blk); ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)); req = alloc_vhd_request(s); if (!req) return -EBUSY; req->treq = treq; req->op = op; req->next = NULL; add_to_tail(&bm->waiting, req); lock_bitmap(bm); DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, " "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op); TRACE(s); return 0; } static void vhd_queue_read(td_driver_t *driver, td_request_t treq) { struct vhd_state *s = (struct vhd_state *)driver->data; DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n", s->vhd.file, treq.sec, treq.secs, treq.sidx); while (treq.secs) { int err; td_request_t clone; err = 0; clone = treq; switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) { case -EINVAL: err = -EINVAL; goto fail; case VHD_BM_BAT_CLEAR: clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); td_forward_request(clone); break; case VHD_BM_BIT_CLEAR: clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0); td_forward_request(clone); break; case VHD_BM_BIT_SET: clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1); err = schedule_data_read(s, clone, 0); if (err) goto fail; break; case VHD_BM_NOT_CACHED: err = schedule_bitmap_read(s, clone.sec / s->spb); if (err) goto fail; clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone); if (err) goto fail; break; case VHD_BM_READ_PENDING: clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone); if (err) goto fail; break; case VHD_BM_BAT_LOCKED: default: ASSERT(0); break; } treq.sec += clone.secs; treq.secs -= clone.secs; treq.buf += vhd_sectors_to_bytes(clone.secs); continue; fail: clone.secs = treq.secs; td_complete_request(clone, err); break; } } static void vhd_queue_write(td_driver_t *driver, td_request_t treq) { struct vhd_state *s = (struct vhd_state *)driver->data; DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n", s->vhd.file, treq.sec, treq.secs, treq.sidx); while (treq.secs) { int err; uint8_t flags; td_request_t clone; err = 0; flags = 0; clone = treq; switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) { case -EINVAL: err = -EINVAL; goto fail; case VHD_BM_BAT_LOCKED: err = -EBUSY; goto fail; case VHD_BM_BAT_CLEAR: flags = (VHD_FLAG_REQ_UPDATE_BAT | VHD_FLAG_REQ_UPDATE_BITMAP); clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); err = schedule_data_write(s, clone, flags); if (err) goto fail; break; case VHD_BM_BIT_CLEAR: flags = VHD_FLAG_REQ_UPDATE_BITMAP; clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0); err = schedule_data_write(s, clone, flags); if (err) goto fail; break; case VHD_BM_BIT_SET: clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1); err = schedule_data_write(s, clone, 0); if (err) goto fail; break; case VHD_BM_NOT_CACHED: clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); err = schedule_bitmap_read(s, clone.sec / s->spb); if (err) goto fail; err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone); if (err) goto fail; break; case VHD_BM_READ_PENDING: clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone); if (err) goto fail; break; default: ASSERT(0); break; } treq.sec += clone.secs; treq.secs -= clone.secs; treq.buf += vhd_sectors_to_bytes(clone.secs); continue; fail: clone.secs = treq.secs; td_complete_request(clone, err); break; } } static inline void signal_completion(struct vhd_request *list, int error) { struct vhd_state *s; struct vhd_request *r, *next; if (!list) return; r = list; s = list->state; while (r) { int err; err = (error ? error : r->error); next = r->next; td_complete_request(r->treq, err); DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", " "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err); free_vhd_request(s, r); r = next; s->returned++; TRACE(s); } } static void start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm) { struct vhd_transaction *tx; struct vhd_request *r, *next; int i; if (!bm->queue.head) return; DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk); r = bm->queue.head; tx = &bm->tx; clear_req_list(&bm->queue); if (r && bat_entry(s, bm->blk) == DD_BLK_UNUSED) tx->error = -EIO; while (r) { next = r->next; r->next = NULL; clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED); add_to_transaction(tx, r); if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) { tx->finished++; if (!r->error) { uint32_t sec = r->treq.sec % s->spb; for (i = 0; i < r->treq.secs; i++) vhd_bitmap_set(&s->vhd, bm->shadow, sec + i); } } r = next; } /* perhaps all the queued writes already completed? */ if (tx->started && transaction_completed(tx)) finish_data_transaction(s, bm); } static void finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm) { struct vhd_transaction *tx = &bm->tx; if (!bat_locked(s)) return; if (s->bat.pbw_blk != bm->blk) return; if (!s->bat.req.error) goto release; if (!test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE)) goto release; tx->closed = 1; return; release: DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk); unlock_bat(s); init_bat(s); } static void finish_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm, int error) { int map_size; struct vhd_transaction *tx = &bm->tx; DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error); tx->error = (tx->error ? tx->error : error); map_size = vhd_sectors_to_bytes(s->bm_secs); if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) { if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) { /* still waiting for bat write */ ASSERT(bm->blk == s->bat.pbw_blk); ASSERT(test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED)); s->bat.req.tx = tx; return; } } if (tx->error) { /* undo changes to shadow */ memcpy(bm->shadow, bm->map, map_size); } else { /* complete atomic write */ memcpy(bm->map, bm->shadow, map_size); if (!test_batmap(s, bm->blk) && bitmap_full(s, bm)) set_batmap(s, bm->blk); } /* transaction done; signal completions */ signal_completion(tx->requests.head, tx->error); init_tx(tx); start_new_bitmap_transaction(s, bm); if (!bitmap_in_use(bm)) unlock_bitmap(bm); finish_bat_transaction(s, bm); } static void finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm) { struct vhd_transaction *tx = &bm->tx; DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk); tx->closed = 1; if (!tx->error) return schedule_bitmap_write(s, bm->blk); return finish_bitmap_transaction(s, bm, 0); } static void finish_bat_write(struct vhd_request *req) { struct vhd_bitmap *bm; struct vhd_transaction *tx; struct vhd_state *s = req->state; s->returned++; TRACE(s); bm = get_bitmap(s, s->bat.pbw_blk); DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n", s->bat.pbw_blk, s->bat.pbw_offset, req->error); ASSERT(bm && bitmap_valid(bm)); ASSERT(bat_locked(s) && test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED)); tx = &bm->tx; ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE)); if (!req->error) { bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset; s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs; } else tx->error = req->error; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) { tx->finished++; remove_from_req_list(&tx->requests, req); if (transaction_completed(tx)) finish_data_transaction(s, bm); } else { clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT); if (s->bat.req.tx) finish_bitmap_transaction(s, bm, req->error); } finish_bat_transaction(s, bm); } static void finish_zero_bm_write(struct vhd_request *req) { uint32_t blk; struct vhd_bitmap *bm; struct vhd_transaction *tx = req->tx; struct vhd_state *s = req->state; s->returned++; TRACE(s); blk = req->treq.sec / s->spb; bm = get_bitmap(s, blk); DBG(TLOG_DBG, "blk: 0x%04x\n", blk); ASSERT(bat_locked(s)); ASSERT(s->bat.pbw_blk == blk); ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm)); tx->finished++; remove_from_req_list(&tx->requests, req); if (req->error) { unlock_bat(s); init_bat(s); tx->error = req->error; clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT); } else schedule_bat_write(s); if (transaction_completed(tx)) finish_data_transaction(s, bm); } static int finish_redundant_bm_write(struct vhd_request *req) { /* uint32_t blk; */ struct vhd_state *s = (struct vhd_state *) req->state; s->returned++; TRACE(s); /* blk = req->treq.sec / s->spb; DBG(TLOG_DBG, "blk: %u\n", blk); */ if (req->error) { ERR(s, req->error, "lsec: 0x%08"PRIx64, req->treq.sec); } free_vhd_request(s, req); s->debug_done_redundant_writes++; return 0; } static void finish_bitmap_read(struct vhd_request *req) { uint32_t blk; struct vhd_bitmap *bm; struct vhd_request *r, *next; struct vhd_state *s = req->state; s->returned++; TRACE(s); blk = req->treq.sec / s->spb; bm = get_bitmap(s, blk); DBG(TLOG_DBG, "blk: 0x%04x\n", blk); ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)); r = bm->waiting.head; clear_req_list(&bm->waiting); clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); if (!req->error) { memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs)); while (r) { struct vhd_request tmp; tmp = *r; next = r->next; free_vhd_request(s, r); ASSERT(tmp.op == VHD_OP_DATA_READ || tmp.op == VHD_OP_DATA_WRITE); if (tmp.op == VHD_OP_DATA_READ) vhd_queue_read(s->driver, tmp.treq); else if (tmp.op == VHD_OP_DATA_WRITE) vhd_queue_write(s->driver, tmp.treq); r = next; } } else { int err = req->error; unlock_bitmap(bm); free_vhd_bitmap(s, bm); return signal_completion(r, err); } if (!bitmap_in_use(bm)) unlock_bitmap(bm); } static void finish_bitmap_write(struct vhd_request *req) { uint32_t blk; struct vhd_bitmap *bm; struct vhd_transaction *tx; struct vhd_state *s = req->state; s->returned++; TRACE(s); blk = req->treq.sec / s->spb; bm = get_bitmap(s, blk); tx = &bm->tx; DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n", blk, tx->started, tx->finished); ASSERT(tx->closed); ASSERT(bm && bitmap_valid(bm)); ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING)); clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING); finish_bitmap_transaction(s, bm, req->error); } static void finish_data_read(struct vhd_request *req) { struct vhd_state *s = req->state; DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", req->treq.sec, req->treq.sec / s->spb); signal_completion(req, 0); } static void finish_data_write(struct vhd_request *req) { int i; struct vhd_transaction *tx = req->tx; struct vhd_state *s = (struct vhd_state *)req->state; set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED); if (tx) { uint32_t blk, sec; struct vhd_bitmap *bm; blk = req->treq.sec / s->spb; sec = req->treq.sec % s->spb; bm = get_bitmap(s, blk); ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm)); tx->finished++; DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x04%"PRIx64", " "tx->started: %d, tx->finished: %d\n", req->treq.sec, req->treq.sec / s->spb, tx->started, tx->finished); if (!req->error) for (i = 0; i < req->treq.secs; i++) vhd_bitmap_set(&s->vhd, bm->shadow, sec + i); if (transaction_completed(tx)) finish_data_transaction(s, bm); } else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) { ASSERT(!req->next); DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", req->treq.sec, req->treq.sec / s->spb); signal_completion(req, 0); } } void vhd_complete(void *arg, struct tiocb *tiocb, int err) { struct vhd_request *req = (struct vhd_request *)arg; struct vhd_state *s = req->state; struct iocb *io = &tiocb->iocb; s->completed++; TRACE(s); req->error = err; if (req->error) ERR(s, req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, " "nbytes: %lu, blk: %"PRIu64", blk_offset: %u", s->vhd.file, req->op, req->treq.sec, req->treq.secs, io->u.c.nbytes, req->treq.sec / s->spb, bat_entry(s, req->treq.sec / s->spb)); switch (req->op) { case VHD_OP_DATA_READ: finish_data_read(req); break; case VHD_OP_DATA_WRITE: finish_data_write(req); break; case VHD_OP_BITMAP_READ: finish_bitmap_read(req); break; case VHD_OP_BITMAP_WRITE: finish_bitmap_write(req); break; case VHD_OP_ZERO_BM_WRITE: finish_zero_bm_write(req); break; case VHD_OP_REDUNDANT_BM_WRITE: finish_redundant_bm_write(req); break; case VHD_OP_BAT_WRITE: finish_bat_write(req); break; default: ASSERT(0); break; } } void vhd_debug(td_driver_t *driver) { int i; struct vhd_state *s = (struct vhd_state *)driver->data; DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", " "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed, s->returned); DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n", s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0)); DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n", s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0)); DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%u total)\n", VHD_REQS_DATA); for (i = 0; i < VHD_REQS_DATA; i++) { struct vhd_request *r = &s->vreq_list[i]; td_request_t *t = &r->treq; const char *vname = t->vreq ? t->vreq->name: NULL; if (t->secs) DBG(TLOG_WARN, "%d: vreq: %s.%d, err: %d, op: %d," " lsec: 0x%08"PRIx64", flags: %d, this: %p, " "next: %p, tx: %p\n", i, vname, t->sidx, r->error, r->op, t->sec, r->flags, r, r->next, r->tx); } DBG(TLOG_WARN, "BITMAP CACHE:\n"); for (i = 0; i < VHD_CACHE_SIZE; i++) { int qnum = 0, wnum = 0, rnum = 0; struct vhd_bitmap *bm = s->bitmap[i]; struct vhd_transaction *tx; struct vhd_request *r; if (!bm) continue; tx = &bm->tx; r = bm->queue.head; while (r) { qnum++; r = r->next; } r = bm->waiting.head; while (r) { wnum++; r = r->next; } r = tx->requests.head; while (r) { rnum++; r = r->next; } DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, " "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, " "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n", i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head, wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error, tx->started, tx->finished, tx->status, tx->requests.head, rnum); } DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, " "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk, s->bat.pbw_offset, s->bat.req.tx); /* for (i = 0; i < s->hdr.max_bat_size; i++) DPRINTF("%d: %u\n", i, s->bat.bat[i]); */ } struct tap_disk tapdisk_vhd = { .disk_type = "tapdisk_vhd", .flags = 0, .private_data_size = sizeof(struct vhd_state), .td_open = _vhd_open, .td_close = _vhd_close, .td_queue_read = vhd_queue_read, .td_queue_write = vhd_queue_write, .td_get_parent_id = vhd_get_parent_id, .td_validate_parent = vhd_validate_parent, .td_debug = vhd_debug, };
_______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel