[Sheepdog] [PATCH RFC v3] Introduce block driver claim and release hooks

Chris Webb chris at arachsys.com
Fri Nov 27 14:43:47 CET 2009


During live migration, block drivers with exclusive locking behaviour [such as
Sheepdog: http://www.osrg.net/sheepdog/] are problematic, as both source and
destination need to have the device open simultaneously. However, the lock is
only required while the VM is running, and at most one VM is running at each
stage of migration. This patch introduces bdrv_claim and bdrv_release hooks
which can be used to claim and release the lock on VM start and stop, allowing
Sheepdog-backed guests to migrate.

This functionality could also be more generally useful. For example, it would
be possible to take fcntl() locks on qcow2 files, preventing corruption from
two qemu processes concurrently modifying qcow2 metadata. Doing this in
bdrv_open() is not possible as it would prevent live migration of guests
backed by qcow2 files on a shared filesystem.

Signed-off-by: Chris Webb <chris at arachsys.com>
---
 block.c     |  50 ++++++++++++++++++++++++++++++++++++++++++++------
 block.h     |   4 ++++
 block_int.h |   2 ++
 monitor.c   |   4 +++-
 qemu-img.c  |  31 +++++++++++++++++++++++++++----
 qemu-io.c   |  11 ++++++++++-
 qemu-kvm.c  |   2 ++
 qemu-nbd.c  |   3 ++-
 vl.c        |   7 ++++++-
 9 files changed, 100 insertions(+), 14 deletions(-)

diff --git a/block.c b/block.c
--- a/block.c
+++ b/block.c
@@ -475,6 +475,40 @@
     return 0;
 }
 
+int bdrv_claim(BlockDriverState *bs)
+{
+    if (bs->drv && bs->drv->bdrv_claim) /* the claim hook is optional */
+        return bs->drv->bdrv_claim(bs); /* pass the driver's result through */
+    return 0; /* no driver or no hook: nothing to claim, not an error */
+}
+
+int bdrv_claim_all(void)
+{
+    BlockDriverState *bs, *held;
+
+    for (bs = bdrv_first; bs != NULL && bdrv_claim(bs) >= 0; bs = bs->next)
+        ; /* stop at the first device whose claim fails */
+    if (bs == NULL)
+        return 0;
+    for (held = bdrv_first; held != bs; held = held->next)
+        bdrv_release(held); /* roll back only the claims taken above */
+    return -1;
+}
+
+void bdrv_release(BlockDriverState *bs)
+{
+    if (bs->drv && bs->drv->bdrv_release) /* the release hook is optional */
+        bs->drv->bdrv_release(bs);
+}
+
+void bdrv_release_all(void)
+{
+    BlockDriverState *bs;
+
+    for (bs = bdrv_first; bs != NULL; bs = bs->next)
+        bdrv_release(bs); /* called for every listed device, claimed or not */
+}
+
 void bdrv_close(BlockDriverState *bs)
 {
     if (bs->drv) {
@@ -499,13 +533,10 @@
 
 void bdrv_close_all(void)
 {
-	BlockDriverState *bs, *n;
+    BlockDriverState *bs, *n;
 
-	for (bs = bdrv_first, n = bs->next; bs; bs = n, n = bs ? bs->next : NULL) {
-		if (bs && bs->drv && bs->drv->bdrv_close) {
-			bs->drv->bdrv_close(bs);
-		}
-	}
+    for (bs = bdrv_first; bs != NULL; bs = n) /* bdrv_first may be NULL */
+        n = bs->next, bdrv_close(bs); /* fetch next before closing bs */
 }
 
 void bdrv_delete(BlockDriverState *bs)
@@ -555,15 +586,20 @@
 	return -ENOTSUP;
     }
 
+    if (bdrv_claim(bs->backing_hd) < 0)
+        return -EACCES;
+
     total_sectors = bdrv_getlength(bs) >> SECTOR_BITS;
     for (i = 0; i < total_sectors;) {
         if (drv->bdrv_is_allocated(bs, i, 65536, &n)) {
             for(j = 0; j < n; j++) {
                 if (bdrv_read(bs, i, sector, 1) != 0) {
+                    bdrv_release(bs->backing_hd);
                     return -EIO;
                 }
 
                 if (bdrv_write(bs->backing_hd, i, sector, 1) != 0) {
+                    bdrv_release(bs->backing_hd);
                     return -EIO;
                 }
                 i++;
@@ -573,6 +609,8 @@
         }
     }
 
+    bdrv_release(bs->backing_hd);
+
     if (drv->bdrv_make_empty)
 	return drv->bdrv_make_empty(bs);
 
diff --git a/block.h b/block.h
--- a/block.h
+++ b/block.h
@@ -58,6 +58,10 @@
 int bdrv_open(BlockDriverState *bs, const char *filename, int flags);
 int bdrv_open2(BlockDriverState *bs, const char *filename, int flags,
                BlockDriver *drv);
+int bdrv_claim(BlockDriverState *bs);
+int bdrv_claim_all(void);
+void bdrv_release(BlockDriverState *bs);
+void bdrv_release_all(void);
 void bdrv_close(BlockDriverState *bs);
 int bdrv_check(BlockDriverState *bs);
 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
diff --git a/block_int.h b/block_int.h
--- a/block_int.h
+++ b/block_int.h
@@ -51,6 +51,8 @@
     int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
     int (*bdrv_probe_device)(const char *filename);
     int (*bdrv_open)(BlockDriverState *bs, const char *filename, int flags);
+    int (*bdrv_claim)(BlockDriverState *bs);
+    void (*bdrv_release)(BlockDriverState *bs);
     int (*bdrv_read)(BlockDriverState *bs, int64_t sector_num,
                      uint8_t *buf, int nb_sectors);
     int (*bdrv_write)(BlockDriverState *bs, int64_t sector_num,
diff --git a/monitor.c b/monitor.c
--- a/monitor.c
+++ b/monitor.c
@@ -456,7 +456,9 @@
 
 static void do_quit(Monitor *mon, const QDict *qdict)
 {
-	bdrv_close_all();
+    if (vm_running)
+        bdrv_release_all(); /* drop the claims taken at VM start */
+    bdrv_close_all();
     exit(0);
 }
 
diff --git a/qemu-img.c b/qemu-img.c
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -471,7 +471,13 @@
     if (bdrv_open2(bs, filename, BRDV_O_FLAGS, drv) < 0) {
         error("Could not open '%s'", filename);
     }
+    if (bdrv_claim(bs) < 0) {
+        error("Could not claim '%s'", filename);
+    }
     ret = bdrv_commit(bs);
+    bdrv_release(bs);
+    bdrv_delete(bs);
+
     switch(ret) {
     case 0:
         printf("Image committed.\n");
@@ -490,7 +496,6 @@
         break;
     }
 
-    bdrv_delete(bs);
     return 0;
 }
 
@@ -654,6 +659,8 @@
     }
 
     out_bs = bdrv_new_open(out_filename, out_fmt);
+    if (bdrv_claim(out_bs) < 0)
+        error("Unable to claim '%s'", out_filename);
 
     bs_i = 0;
     bs_offset = 0;
@@ -790,6 +797,7 @@
             }
         }
     }
+    bdrv_release(out_bs);
     bdrv_delete(out_bs);
     for (bs_i = 0; bs_i < bs_n; bs_i++)
         bdrv_delete(bs[bs_i]);
@@ -994,6 +1002,10 @@
         error("Could not open '%s'", filename);
     }
 
+    if (action != SNAPSHOT_LIST && bdrv_claim(bs) < 0) {
+        error("Could not claim '%s'", filename);
+    }
+
     /* Perform the requested action */
     switch(action) {
     case SNAPSHOT_LIST:
@@ -1009,27 +1021,38 @@
         sn.date_nsec = tv.tv_usec * 1000;
 
         ret = bdrv_snapshot_create(bs, &sn);
-        if (ret)
+        if (ret) {
+            bdrv_release(bs);
+            bdrv_delete(bs);
             error("Could not create snapshot '%s': %d (%s)",
                 snapshot_name, ret, strerror(-ret));
+        }
         break;
 
     case SNAPSHOT_APPLY:
         ret = bdrv_snapshot_goto(bs, snapshot_name);
-        if (ret)
+        if (ret) {
+            bdrv_release(bs);
+            bdrv_delete(bs);
             error("Could not apply snapshot '%s': %d (%s)",
                 snapshot_name, ret, strerror(-ret));
+        }
         break;
 
     case SNAPSHOT_DELETE:
         ret = bdrv_snapshot_delete(bs, snapshot_name);
-        if (ret)
+        if (ret) {
+            bdrv_release(bs);
+            bdrv_delete(bs);
             error("Could not delete snapshot '%s': %d (%s)",
                 snapshot_name, ret, strerror(-ret));
+        }
         break;
     }
 
     /* Cleanup */
+    if (action != SNAPSHOT_LIST)
+        bdrv_release(bs);
     bdrv_delete(bs);
 
     return 0;
diff --git a/qemu-io.c b/qemu-io.c
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -1287,6 +1287,13 @@
 		return 1;
 	}
 
+	if (bdrv_claim(bs) < 0) {
+		fprintf(stderr, "%s: can't claim device %s\n", progname, name);
+		bdrv_close(bs);
+		bs = NULL;
+		return 1;
+	}
+
 	if (growable) {
 		bs->growable = 1;
 	}
@@ -1517,7 +1524,9 @@
 	 */
 	qemu_aio_flush();
 
-	if (bs)
+	if (bs) {
+		bdrv_release(bs);
 		bdrv_close(bs);
+	}
 	return 0;
 }
diff --git a/qemu-kvm.c b/qemu-kvm.c
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -2215,6 +2215,8 @@
         }
     }
 
+    if (vm_running)
+        bdrv_release_all();
     bdrv_close_all();
 
     pause_all_threads();
diff --git a/qemu-nbd.c b/qemu-nbd.c
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -331,7 +331,7 @@
     if (bs == NULL)
         return 1;
 
-    if (bdrv_open(bs, argv[optind], flags) == -1)
+    if (bdrv_open(bs, argv[optind], flags) < 0 || bdrv_claim(bs) < 0)
         return 1;
 
     fd_size = bs->total_sectors * 512;
@@ -470,6 +470,7 @@
     qemu_free(data);
 
     close(sharing_fds[0]);
+    bdrv_release(bs);
     bdrv_close(bs);
     qemu_free(sharing_fds);
     if (socket)
diff --git a/vl.c b/vl.c
--- a/vl.c
+++ b/vl.c
@@ -3219,7 +3219,7 @@
 
 void vm_start(void)
 {
-    if (!vm_running) {
+    if (!vm_running && bdrv_claim_all() >= 0) {
         cpu_enable_ticks();
         vm_running = 1;
         vm_state_notify(1, 0);
@@ -3293,6 +3293,7 @@
         vm_running = 0;
         pause_all_vcpus();
         vm_state_notify(0, reason);
+        bdrv_release_all();
     }
 }
 
@@ -4178,6 +4179,10 @@
         if ((r = qemu_vmstop_requested()))
             vm_stop(r);
     }
+
+    if (vm_running)
+        bdrv_release_all();
+    bdrv_close_all();
     pause_all_vcpus();
 }
 





More information about the sheepdog mailing list