[Sheepdog] [PATCH] collie: fix object consistency when qemu died during write operation
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Sun Apr 18 20:32:59 CEST 2010
If qemu dies during an object write operation, the replicated objects can
be left inconsistent. This patch repairs that broken consistency in the
object recovery routines.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/collie.h | 2 +-
collie/group.c | 30 ++++++++++++++++++++++++++++--
collie/store.c | 55 ++++++++++++++++++++++++++++++++++++++++++-------------
3 files changed, 71 insertions(+), 16 deletions(-)
diff --git a/collie/collie.h b/collie/collie.h
index 826ac3a..734b1a4 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -130,7 +130,7 @@ int remove_epoch(int epoch);
int set_cluster_ctime(uint64_t ctime);
uint64_t get_cluster_ctime(void);
-int start_recovery(uint32_t epoch);
+int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int nr_failed_vdis);
int start_deletion(uint64_t oid);
static inline int is_myself(struct sheepdog_node_list_entry *e)
diff --git a/collie/group.c b/collie/group.c
index 1f5544a..d8ebad9 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -90,6 +90,9 @@ struct work_confch {
struct cpg_address *joined_list;
size_t joined_list_entries;
+ unsigned long *failed_vdis;
+ int nr_failed_vdis;
+
struct work work;
};
@@ -881,7 +884,7 @@ static void __sd_deliver_done(struct work *work, int idx)
*/
if (m->state == DM_FIN && m->op == SD_MSG_JOIN && sys->epoch >= 2)
- start_recovery(sys->epoch);
+ start_recovery(sys->epoch, NULL, 0);
free(w->msg);
free(w);
@@ -1051,7 +1054,12 @@ static void __sd_confch(struct work *work, int idx)
int nr;
struct sheepdog_node_list_entry e[SD_MAX_NODES];
struct vm *vm, *n;
+ int ret, size;
+ uint64_t oid;
+ void *buf;
+ size = sizeof(*w->failed_vdis) * 64;
+ w->failed_vdis = malloc(size);
list_for_each_entry_safe(vm, n, &sys->vm_list, list) {
if (memcmp(vm->ent.host_addr, node->ent.addr,
sizeof(node->ent.addr)) != 0)
@@ -1059,6 +1067,23 @@ static void __sd_confch(struct work *work, int idx)
if (vm->ent.host_port != node->ent.port)
continue;
+ if (w->nr_failed_vdis * sizeof(*w->failed_vdis) >= size) {
+ size *= 2;
+ buf = realloc(w->failed_vdis, size);
+ if (!buf) {
+ eprintf("out of memory, %d\n", size);
+ break;
+ }
+ w->failed_vdis = buf;
+ }
+
+ ret = lookup_vdi((char *)vm->ent.name,
+ sizeof(vm->ent.name), &oid, 0);
+ if (ret == SD_RES_SUCCESS)
+ w->failed_vdis[w->nr_failed_vdis++] = oid_to_bit(oid);
+ else
+ eprintf("cannot find vdi %s\n", vm->ent.name);
+
list_del(&vm->list);
free(vm);
}
@@ -1144,12 +1169,13 @@ static void __sd_confch_done(struct work *work, int idx)
if (w->left_list_entries) {
if (w->left_list_entries > 1)
eprintf("we can't handle %Zd\n", w->left_list_entries);
- start_recovery(sys->epoch);
+ start_recovery(sys->epoch, w->failed_vdis, w->nr_failed_vdis);
}
free(w->member_list);
free(w->left_list);
free(w->joined_list);
+ free(w->failed_vdis);
free(w);
}
diff --git a/collie/store.c b/collie/store.c
index 0fa711e..d89433e 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -959,6 +959,9 @@ struct recovery_work {
struct work work;
struct list_head rw_siblings;
+ unsigned long *failed_vdis;
+ int nr_failed_vdis;
+
int count;
char *buf;
};
@@ -1153,6 +1156,7 @@ static void recover_one(struct work *work, int idx)
int old_nr, cur_nr;
uint32_t epoch = rw->epoch;
int i, my_idx = -1, copy_idx, cur_idx = -1;
+ int is_failed_oid = 0;
eprintf("%d %d, %16lx\n", rw->done, rw->count, oid);
@@ -1180,22 +1184,30 @@ static void recover_one(struct work *work, int idx)
cur_idx = obj_to_sheep(cur_entry, cur_nr, oid, 0);
- for (i = 0; i < cur_nr; i++) {
- if (cur_entry[i].id == sys->this_node.id) {
- my_idx = i;
- break;
- }
+ for (i = 0; i < rw->nr_failed_vdis; i++) {
+ if (rw->failed_vdis[i] == oid_to_bit(oid))
+ is_failed_oid = 1;
}
- copy_idx = node_distance(my_idx, cur_idx, cur_nr);
- dprintf("%d, %d, %d, %d\n", my_idx, cur_idx, cur_nr, copy_idx);
- ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr, cur_idx,
- copy_idx, epoch, epoch - 1, oid, buf, SD_DATA_OBJ_SIZE);
- if (ret == 0)
- goto out;
+ if (!is_failed_oid) {
+ for (i = 0; i < cur_nr; i++) {
+ if (cur_entry[i].id == sys->this_node.id) {
+ my_idx = i;
+ break;
+ }
+ }
+ copy_idx = node_distance(my_idx, cur_idx, cur_nr);
+ dprintf("%d, %d, %d, %d\n", my_idx, cur_idx, cur_nr, copy_idx);
+
+ ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr,
+ cur_idx, copy_idx, epoch, epoch - 1, oid,
+ buf, SD_DATA_OBJ_SIZE);
+ if (ret == 0)
+ goto out;
+ }
for (i = 0; i < sys->nr_sobjs; i++) {
- if (i == copy_idx)
+ if (!is_failed_oid && i == copy_idx)
continue;
ret = __recover_one(rw, old_entry, old_nr,
cur_entry, cur_nr, cur_idx, i,
@@ -1243,6 +1255,7 @@ static void recover_one_done(struct work *work, int idx)
recovering = 0;
free(rw->buf);
+ free(rw->failed_vdis);
free(rw);
if (!list_empty(&recovery_work_list)) {
@@ -1460,6 +1473,7 @@ static void __start_recovery_done(struct work *work, int idx)
recovering = 0;
free(rw->buf);
+ free(rw->failed_vdis);
free(rw);
if (!list_empty(&recovery_work_list)) {
@@ -1473,7 +1487,7 @@ static void __start_recovery_done(struct work *work, int idx)
}
}
-int start_recovery(uint32_t epoch)
+int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int nr_failed_vdis)
{
struct recovery_work *rw;
@@ -1485,6 +1499,16 @@ int start_recovery(uint32_t epoch)
rw->epoch = epoch;
rw->count = 0;
+ if (failed_vdis) {
+ rw->failed_vdis = malloc(nr_failed_vdis * sizeof(*failed_vdis));
+ if (!rw->failed_vdis) {
+ eprintf("out of memory\n");
+ goto fail;
+ }
+ memcpy(rw->failed_vdis, failed_vdis,
+ nr_failed_vdis * sizeof(*failed_vdis));
+ }
+
rw->work.fn = __start_recovery;
rw->work.done = __start_recovery_done;
@@ -1496,6 +1520,11 @@ int start_recovery(uint32_t epoch)
}
return 0;
+fail:
+ free(rw->buf);
+ free(rw->failed_vdis);
+ free(rw);
+ return -1;
}
static int init_path(char *d, int *new)
--
1.5.6.5
More information about the sheepdog mailing list