[Sheepdog] [PATCH] collie: fix object consistency when qemu died during write operation

Sun Apr 18 20:32:59 CEST 2010

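If qemu dies during an object write operation, the replicas of that
object can be left inconsistent with each other. This patch repairs the
inconsistency in the object recovery routines: when a node leaves, the
VDIs used by the VMs on that node are passed to start_recovery(), and
recover_one() handles the objects of those VDIs separately.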

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.h |    2 +-
 collie/group.c  |   30 ++++++++++++++++++++++++++++--
 collie/store.c  |   55 ++++++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index 826ac3a..734b1a4 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -130,7 +130,7 @@ int remove_epoch(int epoch);
 int set_cluster_ctime(uint64_t ctime);
 uint64_t get_cluster_ctime(void);
 
-int start_recovery(uint32_t epoch);
+int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int nr_failed_vdis);
 int start_deletion(uint64_t oid);
 
 static inline int is_myself(struct sheepdog_node_list_entry *e)
diff --git a/collie/group.c b/collie/group.c
index 1f5544a..d8ebad9 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -90,6 +90,9 @@ struct work_confch {
 	struct cpg_address *joined_list;
 	size_t joined_list_entries;
 
+	unsigned long *failed_vdis;
+	int nr_failed_vdis;
+
 	struct work work;
 };
 
@@ -881,7 +884,7 @@ static void __sd_deliver_done(struct work *work, int idx)
 	 */
 
 	if (m->state == DM_FIN && m->op == SD_MSG_JOIN && sys->epoch >= 2)
-		start_recovery(sys->epoch);
+		start_recovery(sys->epoch, NULL, 0);
 
 	free(w->msg);
 	free(w);
@@ -1051,7 +1054,12 @@ static void __sd_confch(struct work *work, int idx)
 			int nr;
 			struct sheepdog_node_list_entry e[SD_MAX_NODES];
 			struct vm *vm, *n;
+			int ret, size;
+			uint64_t oid;
+			void *buf;
 
+			size = sizeof(*w->failed_vdis) * 64;
+			w->failed_vdis = malloc(size);
 			list_for_each_entry_safe(vm, n, &sys->vm_list, list) {
 				if (memcmp(vm->ent.host_addr, node->ent.addr,
 					   sizeof(node->ent.addr)) != 0)
@@ -1059,6 +1067,23 @@ static void __sd_confch(struct work *work, int idx)
 				if (vm->ent.host_port != node->ent.port)
 					continue;
 
+				if (w->nr_failed_vdis * sizeof(*w->failed_vdis) >= size) {
+					size *= 2;
+					buf = realloc(w->failed_vdis, size);
+					if (!buf) {
+						eprintf("out of memory, %d\n", size);
+						break;
+					}
+					w->failed_vdis = buf;
+				}
+
+				ret = lookup_vdi((char *)vm->ent.name,
+						 sizeof(vm->ent.name), &oid, 0);
+				if (ret == SD_RES_SUCCESS)
+					w->failed_vdis[w->nr_failed_vdis++] = oid_to_bit(oid);
+				else
+					eprintf("cannot find vdi %s\n", vm->ent.name);
+
 				list_del(&vm->list);
 				free(vm);
 			}
@@ -1144,12 +1169,13 @@ static void __sd_confch_done(struct work *work, int idx)
 	if (w->left_list_entries) {
 		if (w->left_list_entries > 1)
 			eprintf("we can't handle %Zd\n", w->left_list_entries);
-		start_recovery(sys->epoch);
+		start_recovery(sys->epoch, w->failed_vdis, w->nr_failed_vdis);
 	}
 
 	free(w->member_list);
 	free(w->left_list);
 	free(w->joined_list);
+	free(w->failed_vdis);
 	free(w);
 }
 
diff --git a/collie/store.c b/collie/store.c
index 0fa711e..d89433e 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -959,6 +959,9 @@ struct recovery_work {
 	struct work work;
 	struct list_head rw_siblings;
 
+	unsigned long *failed_vdis;
+	int nr_failed_vdis;
+
 	int count;
 	char *buf;
 };
@@ -1153,6 +1156,7 @@ static void recover_one(struct work *work, int idx)
 	int old_nr, cur_nr;
 	uint32_t epoch = rw->epoch;
 	int i, my_idx = -1, copy_idx, cur_idx = -1;
+	int is_failed_oid = 0;
 
 	eprintf("%d %d, %16lx\n", rw->done, rw->count, oid);
 
@@ -1180,22 +1184,30 @@ static void recover_one(struct work *work, int idx)
 
 	cur_idx = obj_to_sheep(cur_entry, cur_nr, oid, 0);
 
-	for (i = 0; i < cur_nr; i++) {
-		if (cur_entry[i].id == sys->this_node.id) {
-			my_idx = i;
-			break;
-		}
+	for (i = 0; i < rw->nr_failed_vdis; i++) {
+		if (rw->failed_vdis[i] == oid_to_bit(oid))
+			is_failed_oid = 1;
 	}
-	copy_idx = node_distance(my_idx, cur_idx, cur_nr);
-	dprintf("%d, %d, %d, %d\n", my_idx, cur_idx, cur_nr, copy_idx);
 
-	ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr, cur_idx,
-			    copy_idx, epoch, epoch - 1, oid, buf, SD_DATA_OBJ_SIZE);
-	if (ret == 0)
-		goto out;
+	if (!is_failed_oid) {
+		for (i = 0; i < cur_nr; i++) {
+			if (cur_entry[i].id == sys->this_node.id) {
+				my_idx = i;
+				break;
+			}
+		}
+		copy_idx = node_distance(my_idx, cur_idx, cur_nr);
+		dprintf("%d, %d, %d, %d\n", my_idx, cur_idx, cur_nr, copy_idx);
+
+		ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr,
+				    cur_idx, copy_idx, epoch, epoch - 1, oid,
+				    buf, SD_DATA_OBJ_SIZE);
+		if (ret == 0)
+			goto out;
+	}
 
 	for (i = 0; i < sys->nr_sobjs; i++) {
-		if (i == copy_idx)
+		if (!is_failed_oid && i == copy_idx)
 			continue;
 		ret = __recover_one(rw, old_entry, old_nr,
 				    cur_entry, cur_nr, cur_idx, i,
@@ -1243,6 +1255,7 @@ static void recover_one_done(struct work *work, int idx)
 	recovering = 0;
 
 	free(rw->buf);
+	free(rw->failed_vdis);
 	free(rw);
 
 	if (!list_empty(&recovery_work_list)) {
@@ -1460,6 +1473,7 @@ static void __start_recovery_done(struct work *work, int idx)
 	recovering = 0;
 
 	free(rw->buf);
+	free(rw->failed_vdis);
 	free(rw);
 
 	if (!list_empty(&recovery_work_list)) {
@@ -1473,7 +1487,7 @@ static void __start_recovery_done(struct work *work, int idx)
 	}
 }
 
-int start_recovery(uint32_t epoch)
+int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int nr_failed_vdis)
 {
 	struct recovery_work *rw;
 
@@ -1485,6 +1499,16 @@ int start_recovery(uint32_t epoch)
 	rw->epoch = epoch;
 	rw->count = 0;
 
+	if (failed_vdis) {
+		rw->failed_vdis = malloc(nr_failed_vdis * sizeof(*failed_vdis));
+		if (!rw->failed_vdis) {
+			eprintf("out of memory\n");
+			goto fail;
+		}
+		memcpy(rw->failed_vdis, failed_vdis,
+		       nr_failed_vdis * sizeof(*failed_vdis));
+	}
+
 	rw->work.fn = __start_recovery;
 	rw->work.done = __start_recovery_done;
 
@@ -1496,6 +1520,11 @@ int start_recovery(uint32_t epoch)
 	}
 
 	return 0;
+fail:
+	free(rw->buf);
+	free(rw->failed_vdis);
+	free(rw);
+	return -1;
 }
 
 static int init_path(char *d, int *new)
-- 
1.5.6.5