[Sheepdog] [PATCH] sheepdog: recover unaccessed data objects

FUJITA Tomonori fujita.tomonori at lab.ntt.co.jp
Mon Nov 16 16:17:17 CET 2009


I've not tested the recovery code yet, but it doesn't seem to break the
normal path.

=
From: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
Subject: [PATCH] sheepdog: recover unaccessed data objects

Currently, a data object is recovered only when qemu (i.e. a guest OS)
accesses it and recovery turns out to be necessary (that is, the hosts
that kept the object have been lost).

This patch adds a mechanism that periodically checks whether any data
objects need to be recovered and recovers them if necessary.

Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
 block/sheepdog.c |  146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-tool.c      |   13 +++++
 2 files changed, 159 insertions(+), 0 deletions(-)
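
For review, the gist of the mechanism in isolation: the driver records
the epoch each data object was last written under, scans for objects
whose epoch is behind the cluster's, and recovers one object at a time
by issuing an ordinary read (the existing read path already recovers an
object as a side effect). Below is a minimal standalone sketch of just
the scan-and-resume policy; every name and size in it is made up for
illustration and is not the driver's:

#include <stdio.h>

#define NR_OBJS 8	/* made-up count; the driver uses MAX_DATA_OBJS */

static unsigned int cluster_epoch = 3;	/* the cluster's current epoch */
static unsigned int obj_epoch[NR_OBJS];	/* epoch each object was written in */

/*
 * Return the index of the next object whose epoch is stale, scanning
 * circularly from 'start' so that a scan resumes where the previous
 * one left off, or -1 when every object is up to date.
 */
static int pick_stale_obj(unsigned int start)
{
	unsigned int i, idx;

	for (i = 0; i < NR_OBJS; i++) {
		idx = (start + i) % NR_OBJS;
		if (obj_epoch[idx] != cluster_epoch)
			return idx;
	}
	return -1;
}

int main(void)
{
	unsigned int resume = 0;
	int idx;

	obj_epoch[1] = cluster_epoch;	/* objects 1 and 4 are already current */
	obj_epoch[4] = cluster_epoch;

	/* one object per pass, like one recovery read per timer tick */
	while ((idx = pick_stale_obj(resume)) >= 0) {
		printf("recovering object %d\n", idx);
		obj_epoch[idx] = cluster_epoch;	/* the real read does this */
		resume = idx + 1;
	}
	return 0;
}

In the patch itself one object is handled per sd_aio_readv() completion,
and a 10-second timer re-arms the scan when nothing is stale.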

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 99b2708..0195021 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -26,6 +26,7 @@
 
 #include "qemu-common.h"
 #include "block_int.h"
+#include "qemu-timer.h"
 
 #define DOG_PORT 7000
 
@@ -250,6 +251,8 @@ struct sd_aiocb {
 	int write;
 
 	QLIST_HEAD(aioreq_head, aio_req) aioreq_head;
+
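+	/* entry in bdrv_sd_state's pending or running list */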
+	QLIST_ENTRY(sd_aiocb) aiocb_siblings;
 };
 
 struct sd_aiostate {
@@ -261,12 +264,25 @@ struct sd_aiostate {
 	int nr_aio_req_free;
 };
 
+#define BIT_PER_LONG (sizeof(long) * 8)
+
 struct bdrv_sd_state {
 	struct sd_inode inode;
 
 	char *name;
 	int is_current;
 
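+	/* background object recovery state; see start_recovering_objs() */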
+	int recovering;
+	int stop_recovering;
+	unsigned long recovering_idx;
+	long last_recovering_time;
+
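+	/* iovec for recovery reads (allocated lazily) and the scan timer */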
+	QEMUIOVector rqiov;
+	QEMUTimer *ts;
+
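+	/* guest requests deferred by recovery, and those in flight */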
+	QLIST_HEAD(pending_aiocb_head, sd_aiocb) pending_aiocb_head;
+	QLIST_HEAD(running_aiocb_head, sd_aiocb) running_aiocb_head;
+
 	uint32_t obj_epoch_array[MAX_DATA_OBJS];
 
 	struct sd_aiostate aio_state_array[FD_SETSIZE];
@@ -283,6 +299,8 @@ static uint32_t s_epoch;
 static int nr_nodes;
 static struct sheepdog_node_list_entry *node_list_entries;
 
+static void start_recovering_objs(void *);
+
 static const char * sd_strerror(int err)
 {
 	int i;
@@ -428,6 +446,7 @@ static inline struct aio_req *get_req_from_id(struct sd_aiostate *s, int id)
 static void sd_finish_aiocb(struct sd_aiocb *acb)
 {
 	acb->common.cb(acb->common.opaque, acb->ret);
+	QLIST_REMOVE(acb, aiocb_siblings);
 	qemu_aio_release(acb);
 }
 
@@ -1308,6 +1327,9 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
 	unsigned int epoch;
 	char *buf;
 
+	QLIST_INIT(&s->pending_aiocb_head);
+	QLIST_INIT(&s->running_aiocb_head);
+
 	buf = malloc(SD_INODE_SIZE);
 	if (!buf) {
 		eprintf("Failed to allocate memory\n");
@@ -1357,6 +1379,11 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
 	memcpy(s->obj_epoch_array, buf + sizeof(s->inode),
 	       sizeof(s->obj_epoch_array));
 
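+	/* arm background recovery, except when opening a snapshot; the
+	 * rt_clock check skips it in the tools, where the timer
+	 * functions are no-op stubs (see qemu-tool.c) */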
+	s->ts = qemu_new_timer(rt_clock, start_recovering_objs, bs);
+
+	if (!for_snapshot && rt_clock)
+		start_recovering_objs(bs);
+
 	bs->total_sectors = s->inode.vdi_size >> 9;
 	s->name = strdup(vdi);
 	free(buf);
@@ -1494,6 +1521,12 @@ static void sd_close(BlockDriverState *bs)
 	if (!ret && rsp->result != SD_RES_SUCCESS)
 		eprintf("%s, %s\n", sd_strerror(rsp->result), s->name);
 
+	/* XXX: wait for recovery to stop more gracefully */
+	s->stop_recovering = 1;
+	while (s->recovering)
+		qemu_aio_wait();
+	qemu_free_timer(s->ts);
+
 	free(s->name);
 }
 
@@ -1594,6 +1627,24 @@ out:
 	return ret;
 }
 
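+/*
+ * A guest request that touches the object being recovered (ridx) must
+ * not run until that object's recovery finishes; conflicting requests
+ * are parked on the pending list and resumed from recovering_done().
+ */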
+static int aiocb_overlaps_idx(struct sd_aiocb *acb, unsigned long ridx)
+{
+	int i;
+	unsigned long idx = acb->sector_num * 512 / CHUNK_SIZE;
+
+	for (i = 0; i < nr_chunks(acb); i++) {
+		if (idx + i == ridx)
+			return 1;
+	}
+
+	return 0;
+}
+
+static int conflict_recovering(struct bdrv_sd_state *s,
+			       struct sd_aiocb *acb,
+			       unsigned long ridx)
+{
+	if (!s->recovering)
+		return 0;
+
+	return aiocb_overlaps_idx(acb, ridx);
+}
+
 static void sd_write_bh_cb(void *p)
 {
 	struct sd_aiocb *acb = p;
@@ -1612,6 +1663,13 @@ static void sd_write_bh_cb(void *p)
 		acb->bh = NULL;
 	}
 
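+	/* park writes that overlap the object being recovered */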
+	if (conflict_recovering(s, acb, s->recovering_idx)) {
+		QLIST_INSERT_HEAD(&s->pending_aiocb_head, acb, aiocb_siblings);
+		return;
+	}
+
+	QLIST_INSERT_HEAD(&s->running_aiocb_head, acb, aiocb_siblings);
+
 	if (acb->retries > MAX_RETRIES) {
 		ret = -EIO;
 		goto abort;
@@ -1809,6 +1867,13 @@ static void sd_readv_bh_cb(void *p)
 	qemu_bh_delete(acb->bh);
 	acb->bh = NULL;
 
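+	/* park reads that overlap the object being recovered */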
+	if (conflict_recovering(s, acb, s->recovering_idx)) {
+		QLIST_INSERT_HEAD(&s->pending_aiocb_head, acb, aiocb_siblings);
+		return;
+	}
+
+	QLIST_INSERT_HEAD(&s->running_aiocb_head, acb, aiocb_siblings);
+
 	if (acb->retries > MAX_RETRIES) {
 		ret = -EIO;
 		goto abort;
@@ -1891,6 +1956,87 @@ static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs,
 	return &acb->common;
 }
 
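+/*
+ * Completion callback of the recovery read issued by
+ * start_recovering_objs(): move on to the next object and restart any
+ * guest requests that were deferred in the meantime.
+ */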
+static void recovering_done(void *opaque, int ret)
+{
+	BlockDriverState *bs = opaque;
+	struct bdrv_sd_state *s = bs->opaque;
+	struct sd_aiocb *acb, *next;
+
+	if (!ret)
+		s->recovering_idx++;
+
+	s->recovering = 0;
+	if (s->stop_recovering)
+		return;
+
+	start_recovering_objs(bs);
+
+	QLIST_FOREACH_SAFE(acb, &s->pending_aiocb_head, aiocb_siblings, next) {
+		/*
+		 * We could resubmit only the acbs that no longer conflict,
+		 * but sd_readv_bh_cb and sd_write_bh_cb re-check anyway.
+		 */
+
+		QLIST_REMOVE(acb, aiocb_siblings);
+
+		if (acb->aiocb_type == AIOCB_READ_EPOCH)
+			sd_schedule_bh(sd_readv_bh_cb, acb);
+		else
+			sd_schedule_bh(sd_write_bh_cb, acb);
+	}
+}
+
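+/*
+ * Look for a data object whose epoch is behind the cluster's and read
+ * it back; the ordinary read path recovers such an object as a side
+ * effect.  If nothing needs recovery now, try again in ten seconds.
+ */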
+static void start_recovering_objs(void *opaque)
+{
+	BlockDriverState *bs = opaque;
+	struct bdrv_sd_state *s = bs->opaque;
+	int i, idx;
+	QEMUIOVector *qiov = &s->rqiov;
+	BlockDriverAIOCB *bdaiocb;
+	char *buf;
+	struct sd_aiocb *acb;
+
+	if (s->recovering)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(s->obj_epoch_array); i++) {
+		idx = (s->recovering_idx + i) % ARRAY_SIZE(s->obj_epoch_array);
+
+		if (s->inode.data_oid[idx] &&
+		    is_data_obj_writeable(s->inode.oid, s->inode.data_oid[idx]) &&
+		    s->obj_epoch_array[idx] != s_epoch) {
+			int busy = 0;
+
+			/* skip an object with in-flight guest I/O;
+			 * we will come back to it on a later scan */
+			QLIST_FOREACH(acb, &s->running_aiocb_head, aiocb_siblings) {
+				if (aiocb_overlaps_idx(acb, idx)) {
+					busy = 1;
+					break;
+				}
+			}
+			if (busy)
+				continue;
+
+			if (!qiov->iov) {
+				buf = malloc(SD_DATA_OBJ_SIZE);
+				if (!buf)
+					return;
+
+				qemu_iovec_init(qiov, 1);
+				qemu_iovec_add(qiov, buf, SD_DATA_OBJ_SIZE);
+			}
+
+			bdaiocb = sd_aio_readv(bs, (int64_t)idx * SD_DATA_OBJ_SIZE / 512,
+					       qiov, SD_DATA_OBJ_SIZE / 512,
+					       recovering_done, bs);
+			if (bdaiocb) {
+				s->recovering = 1;
+				s->recovering_idx = idx;
+				return;
+			}
+
+			break;
+		}
+	}
+
+	s->recovering = 0;
+	qemu_mod_timer(s->ts, qemu_get_clock(rt_clock) + 10000);
+}
+
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 {
 	struct bdrv_sd_state *s = bs->opaque;
diff --git a/qemu-tool.c b/qemu-tool.c
index ba24aa2..f058ae0 100644
--- a/qemu-tool.c
+++ b/qemu-tool.c
@@ -88,3 +88,16 @@ int64_t qemu_get_clock(QEMUClock *clock)
     qemu_gettimeofday(&tv);
     return (tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000)) / 1000000;
 }
+
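+/* the tools have no main loop, so the timer API can be no-op stubs */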
+QEMUTimer *qemu_new_timer(QEMUClock *clock, QEMUTimerCB *cb, void *opaque)
+{
+    return NULL;
+}
+
+void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
+{
+}
+
+void qemu_free_timer(QEMUTimer *ts)
+{
+}
-- 
1.5.6.5