This is v3 of the lock patch, previously discussed here:

http://lists.gnu.org/archive/html/qemu-devel/2009-12/threads.html#00461

In this version I've reverted back to the simpler interface.  There is
now only one "lock" option, which can be lock=exclusive|shared|none.

At Kevin Wolf's suggestion,
lock=exclusive|shared => all backing disks are locked shared
lock=none => no locks are acquired on any front or back disks

In order to mitigate the problem with locks during live migration,
I've added a lock command to the monitor, which currently allows you
to acquire (but not revoke) a lock.  (Revocation could be added fairly
easily too.)  This should allow the management tool to start the qemu
destination process without locking, and lock it once migration is
complete.

To keep the implementation very simple, the "commit" doesn't try to
acquire any extra locks.  The "commit" command should probably never
be used when a backing file could be shared.

In general I've attempted to keep the patch simple, but with room for
future extension.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
virt-top is 'top' for virtual machines.  Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://et.redhat.com/~rjones/virt-top
>From 85eedb171016efa769c85ec1f9ee9188a7f79978 Mon Sep 17 00:00:00 2001
From: Richard Jones <rjo...@redhat.com>
Date: Tue, 8 Dec 2009 13:09:31 +0000
Subject: [PATCH] Disk image exclusive and shared locks.

Allow qemu to acquire shared and exclusive locks on disk images.
This is done by extending the -drive option with an additional,
optional parameter:

  -drive [...],lock=none
  -drive [...],lock=exclusive
  -drive [...],lock=shared

lock=none is the default, and it means that we don't try to acquire
any sort of lock.

lock=exclusive tries to acquire an exclusive lock on the disk
image.

lock=shared tries to acquire a shared lock on the disk image.
Multiple instances of qemu may all hold this sort of lock.

If acquisition of a lock fails, opening the image fails.

For disk types that have backing disks (eg. qcow2), the backing
disk is opened with a shared lock if the front disk has any sort
of lock.  Otherwise the backing disk is not locked.  Users must
be cautious with the "commit" command which doesn't try to acquire
an exclusive lock on the backing disk, but in any case it's almost
never safe to use the "commit" command on any shared backing disk.

Also implemented is a "lock" command in the monitor, allowing
you to acquire (but not revoke) a lock after qemu has started.

The implementation of locks only works for raw POSIX and Win32
files.  However many of the other block types are implemented
in terms of these drivers, so they "inherit" locking too.  Other
drivers are read-only, so don't require locking.  Below we note
only the cases where locking is *not* implemented:

  cloop - directly open()s the file, no locking implemented
  cow - same as cloop
  curl - protocol probably doesn't support locking
  nbd - same as curl

Signed-off-by: Richard W.M. Jones <rjo...@redhat.com>
---
 block.c           |   19 +++++++++++++++++--
 block.h           |    4 ++++
 block/raw-posix.c |   25 +++++++++++++++++++++++++
 block/raw-win32.c |   31 +++++++++++++++++++++++++++++++
 block_int.h       |    1 +
 monitor.c         |   32 ++++++++++++++++++++++++++++++++
 qemu-config.c     |    4 ++++
 qemu-monitor.hx   |   19 +++++++++++++++++++
 qemu-options.hx   |    7 +++++++
 qerror.c          |    4 ++++
 qerror.h          |    3 +++
 vl.c              |   16 ++++++++++++++++
 12 files changed, 163 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 3f3496e..0e69ba8 100644
--- a/block.c
+++ b/block.c
@@ -449,7 +449,7 @@ int bdrv_open2(BlockDriverState *bs, const char *filename, 
int flags,
     try_rw = !bs->read_only || bs->is_temporary;
     if (!(flags & BDRV_O_FILE))
         open_flags = (try_rw ? BDRV_O_RDWR : 0) |
-            (flags & (BDRV_O_CACHE_MASK|BDRV_O_NATIVE_AIO));
+            (flags & (BDRV_O_CACHE_MASK|BDRV_O_NATIVE_AIO|BDRV_O_LOCK_MASK));
     else
         open_flags = flags & ~(BDRV_O_FILE | BDRV_O_SNAPSHOT);
     if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv))
@@ -480,14 +480,19 @@ int bdrv_open2(BlockDriverState *bs, const char 
*filename, int flags,
     if (bs->backing_file[0] != '\0') {
         /* if there is a backing file, use it */
         BlockDriver *back_drv = NULL;
+        int back_drv_open_flags = open_flags;
         bs->backing_hd = bdrv_new("");
         /* pass on read_only property to the backing_hd */
         bs->backing_hd->read_only = bs->read_only;
+        /* if front disk is locked, lock backing disk shared */
+        back_drv_open_flags &= ~BDRV_O_LOCK_MASK;
+        if (open_flags & BDRV_O_LOCK_MASK)
+            back_drv_open_flags |= BDRV_O_LOCK_SHARED;
         path_combine(backing_filename, sizeof(backing_filename),
                      filename, bs->backing_file);
         if (bs->backing_format[0] != '\0')
             back_drv = bdrv_find_format(bs->backing_format);
-        ret = bdrv_open2(bs->backing_hd, backing_filename, open_flags,
+        ret = bdrv_open2(bs->backing_hd, backing_filename, back_drv_open_flags,
                          back_drv);
         if (ret < 0) {
             bdrv_close(bs);
@@ -1388,6 +1393,16 @@ int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo 
*bdi)
     return drv->bdrv_get_info(bs, bdi);
 }
 
+int bdrv_acquire_lock(BlockDriverState *bs, int lock_flags)
+{
+    BlockDriver *drv = bs->drv;
+    if (!drv)
+        return -ENOMEDIUM;
+    if (!drv->bdrv_acquire_lock)
+        return -ENOTSUP;
+    return drv->bdrv_acquire_lock(bs, lock_flags);
+}
+
 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                       int64_t pos, int size)
 {
diff --git a/block.h b/block.h
index fa51ddf..bd15bbe 100644
--- a/block.h
+++ b/block.h
@@ -39,8 +39,11 @@ typedef struct QEMUSnapshotInfo {
 #define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
 #define BDRV_O_CACHE_WB    0x0040 /* use write-back caching */
 #define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool 
*/
+#define BDRV_O_LOCK_SHARED 0x0100 /* fail unless we can lock shared */
+#define BDRV_O_LOCK_EXCLUSIVE 0x0200 /* fail unless we can lock exclusive */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
+#define BDRV_O_LOCK_MASK   (BDRV_O_LOCK_SHARED | BDRV_O_LOCK_EXCLUSIVE)
 
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1 << BDRV_SECTOR_BITS)
@@ -170,6 +173,7 @@ const char *bdrv_get_device_name(BlockDriverState *bs);
 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                           const uint8_t *buf, int nb_sectors);
 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+int bdrv_acquire_lock(BlockDriverState *bs, int lock_flags);
 
 const char *bdrv_get_encrypted_filename(BlockDriverState *bs);
 void bdrv_get_backing_filename(BlockDriverState *bs,
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 5a6a22b..4c3326c 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -128,6 +128,24 @@ static int64_t raw_getlength(BlockDriverState *bs);
 static int cdrom_reopen(BlockDriverState *bs);
 #endif
 
+static int raw_acquire_lock(BlockDriverState *bs, int lock_flags)
+{
+    BDRVRawState *s = bs->opaque;
+    struct flock lk;
+
+    if (lock_flags & BDRV_O_LOCK_SHARED)
+        lk.l_type = F_RDLCK;
+    else /* bdrv_flags & BDRV_O_LOCK_EXCLUSIVE */
+        lk.l_type = F_WRLCK;
+    lk.l_whence = SEEK_SET;
+    lk.l_start = 0;
+    lk.l_len = 0;               /* means lock the whole file */
+
+    if (fcntl (s->fd, F_SETLK, &lk) == -1)
+        return -errno;
+    return 0;
+}
+
 static int raw_open_common(BlockDriverState *bs, const char *filename,
                            int bdrv_flags, int open_flags)
 {
@@ -163,6 +181,11 @@ static int raw_open_common(BlockDriverState *bs, const 
char *filename,
     s->fd = fd;
     s->aligned_buf = NULL;
 
+    if (bdrv_flags & BDRV_O_LOCK_MASK) {
+        if (raw_acquire_lock(bs, bdrv_flags & BDRV_O_LOCK_MASK) < 0)
+            goto out_close;
+    }
+
     if ((bdrv_flags & BDRV_O_NOCACHE)) {
         s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
         if (s->aligned_buf == NULL) {
@@ -768,6 +791,8 @@ static BlockDriver bdrv_raw = {
     .bdrv_truncate = raw_truncate,
     .bdrv_getlength = raw_getlength,
 
+    .bdrv_acquire_lock = raw_acquire_lock,
+
     .create_options = raw_create_options,
 };
 
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 72acad5..6f89f03 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -73,6 +73,27 @@ static int set_sparse(int fd)
                                 NULL, 0, NULL, 0, &returned, NULL);
 }
 
+static int raw_acquire_lock(BlockDriverState *bs, int lock_flags)
+{
+    BDRVRawState *s = bs->opaque;
+    DWORD flags;
+    OVERLAPPED ov;
+
+    flags = LOCKFILE_FAIL_IMMEDIATELY;
+    if (lock_flags & BDRV_O_LOCK_EXCLUSIVE)
+        flags |= LOCKFILE_EXCLUSIVE_LOCK;
+
+    memset(&ov, 0, sizeof(ov));
+    ov.Offset = 0;
+    ov.OffsetHigh = 0;
+
+    if (!LockFileEx(s->hfile, flags, 0, 1, 0, &ov))
+        /* For compatibility with the POSIX lock failure ... */
+        return -EAGAIN;
+
+    return 0;
+}
+
 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
@@ -106,6 +127,15 @@ static int raw_open(BlockDriverState *bs, const char 
*filename, int flags)
             return -EACCES;
         return -1;
     }
+
+    if (flags & BDRV_O_LOCK_MASK) {
+        int err;
+
+        err = raw_acquire_lock(s, flags & BDRV_O_LOCK_MASK);
+        if (err < 0)
+            return err;
+    }
+
     return 0;
 }
 
@@ -253,6 +283,7 @@ static BlockDriver bdrv_raw = {
     .bdrv_write                = raw_write,
     .bdrv_truncate     = raw_truncate,
     .bdrv_getlength    = raw_getlength,
+    .bdrv_acquire_lock = raw_acquire_lock,
 
     .create_options = raw_create_options,
 };
diff --git a/block_int.h b/block_int.h
index 9a3b2e0..729d540 100644
--- a/block_int.h
+++ b/block_int.h
@@ -92,6 +92,7 @@ struct BlockDriver {
     int (*bdrv_snapshot_list)(BlockDriverState *bs,
                               QEMUSnapshotInfo **psn_info);
     int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
+    int (*bdrv_acquire_lock)(BlockDriverState *bs, int lock_flags);
 
     int (*bdrv_save_vmstate)(BlockDriverState *bs, const uint8_t *buf,
                              int64_t pos, int size);
diff --git a/monitor.c b/monitor.c
index d97d529..d00d5d4 100644
--- a/monitor.c
+++ b/monitor.c
@@ -890,6 +890,38 @@ static void do_block_set_passwd(Monitor *mon, const QDict 
*qdict,
     }
 }
 
+static void do_block_lock(Monitor *mon, const QDict *qdict,
+                          QObject **ret_data)
+{
+    BlockDriverState *bs;
+    const char *p;
+    int lock_flags;
+
+    bs = bdrv_find(qdict_get_str(qdict, "device"));
+    if (!bs) {
+        qemu_error_new(QERR_DEVICE_NOT_FOUND, qdict_get_str(qdict, "device"));
+        return;
+    }
+
+    p = qdict_get_str(qdict, "mode");
+    if (!p) {
+        qemu_error_new(QERR_MISSING_PARAMETER, "mode");
+        return;
+    }
+    if (!strcmp(p, "exclusive"))
+        lock_flags = BDRV_O_LOCK_EXCLUSIVE;
+    else if (!strcmp (p, "shared"))
+        lock_flags = BDRV_O_LOCK_SHARED;
+    else {
+        qemu_error_new(QERR_INVALID_PARAMETER, p);
+        return;
+    }
+
+    if (bdrv_acquire_lock(bs, lock_flags) < 0) {
+        qemu_error_new(QERR_LOCK_ACQUIRE_FAILED);
+    }
+}
+
 static void do_change_block(Monitor *mon, const char *device,
                             const char *filename, const char *fmt)
 {
diff --git a/qemu-config.c b/qemu-config.c
index c3203c8..df0d3fb 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -78,6 +78,10 @@ QemuOptsList qemu_drive_opts = {
         },{
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
+        },{
+            .name = "lock",
+            .type = QEMU_OPT_STRING,
+            .help = "lock disk image (exclusive, shared, none)",
         },
         { /* end if list */ }
     },
diff --git a/qemu-monitor.hx b/qemu-monitor.hx
index c788c73..9474905 100644
--- a/qemu-monitor.hx
+++ b/qemu-monitor.hx
@@ -1063,6 +1063,25 @@ STEXI
 Set the encrypted device @var{device} password to @var{password}
 ETEXI
 
+    {
+        .name       = "lock",
+        .args_type  = "device:B,mode:s",
+        .params     = "device [exclusive|shared]",
+        .help       = "acquire a lock on an existing device",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_block_lock,
+    },
+
+STEXI
+...@item lock @var{device} @var{mode}
+Acquire a lock on @var{device}.  The @var{mode} string should be
+...@var{exclusive} to acquire an exclusive lock, or @var{shared} to
+acquire a shared lock.
+
+This does not attempt to lock the backing disk for formats like
+qcow2 that can have backing storage.
+ETEXI
+
 STEXI
 @end table
 ETEXI
diff --git a/qemu-options.hx b/qemu-options.hx
index b8cc375..efc5f19 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -104,6 +104,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
     "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
     "       [,addr=A][,id=name][,aio=threads|native]\n"
+    "       [,lock=exclusive|shared|none]\n"
     "                use 'file' as a drive image\n")
 DEF("set", HAS_ARG, QEMU_OPTION_set,
     "-set group.id.arg=value\n"
@@ -149,6 +150,12 @@ an untrusted format header.
 This option specifies the serial number to assign to the device.
 @item ad...@var{addr}
 Specify the controller's PCI address (if=virtio only).
+...@item lo...@var{mode}
+Acquire a lock on the disk image (@var{file}).
+Available modes are: exclusive, shared, none.
+The default is "none", meaning we don't try to acquire a lock.  To
+avoid multiple virtual machines trying to write to a disk at the
+same time (which can cause disk corruption), use lock=exclusive.
 @end table
 
 By default, writethrough caching is used for all block device.  This means that
diff --git a/qerror.c b/qerror.c
index 5f8fc5d..8b74a58 100644
--- a/qerror.c
+++ b/qerror.c
@@ -120,6 +120,10 @@ static const QErrorStringTable qerror_table[] = {
         .error_fmt = QERR_VNC_SERVER_FAILED,
         .desc      = "Could not start VNC server on %(target)",
     },
+    {
+        .error_fmt = QERR_LOCK_ACQUIRE_FAILED,
+        .desc      = "Could not lock device with requested mode",
+    },
     {}
 };
 
diff --git a/qerror.h b/qerror.h
index 9e220d6..540bf8d 100644
--- a/qerror.h
+++ b/qerror.h
@@ -100,4 +100,7 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_VNC_SERVER_FAILED \
     "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }"
 
+#define QERR_LOCK_ACQUIRE_FAILED \
+    "{ 'class': 'LockAcquireFailed', 'data': {} }"
+
 #endif /* QERROR_H */
diff --git a/vl.c b/vl.c
index c0d98f5..b114518 100644
--- a/vl.c
+++ b/vl.c
@@ -2110,6 +2110,7 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
     const char *devaddr;
     DriveInfo *dinfo;
     int snapshot = 0;
+    int lock_flags = 0;
 
     *fatal_error = 1;
 
@@ -2300,6 +2301,19 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
         }
     }
 
+    if ((buf = qemu_opt_get(opts, "lock")) != NULL) {
+        if (!strcmp(buf, "none"))
+            /* nothing */;
+        else if (!strcmp(buf, "shared"))
+            lock_flags |= BDRV_O_LOCK_SHARED;
+        else if (!strcmp(buf, "exclusive"))
+            lock_flags |= BDRV_O_LOCK_EXCLUSIVE;
+        else {
+           fprintf(stderr, "qemu: invalid lock option\n");
+           return NULL;
+        }
+    }
+
     /* compute bus and unit according index */
 
     if (index != -1) {
@@ -2444,6 +2458,8 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
         (void)bdrv_set_read_only(dinfo->bdrv, 1);
     }
 
+    bdrv_flags |= lock_flags;
+
     if (bdrv_open2(dinfo->bdrv, file, bdrv_flags, drv) < 0) {
         fprintf(stderr, "qemu: could not open disk image %s: %s\n",
                         file, strerror(errno));
-- 
1.6.5.2

Reply via email to