Use struct vnode_info to hold vnode lists in recovery instead of
open-coding the data structures.  This helps to keep the code simpler,
especially in do_recover_object which did mind-boggling copy games
before.

Signed-off-by: Christoph Hellwig <hch at lst.de>

---
 sheep/recovery.c |  138 ++++++++++++++++++++++---------------------------------
 1 file changed, 56 insertions(+), 82 deletions(-)

Index: sheepdog/sheep/recovery.c
===================================================================
--- sheepdog.orig/sheep/recovery.c	2012-05-28 14:40:21.440682850 +0200
+++ sheepdog/sheep/recovery.c	2012-05-28 14:43:05.072687040 +0200
@@ -41,10 +41,9 @@ struct recovery_work {
 	struct sd_node old_nodes[SD_MAX_NODES];
 	int cur_nr_nodes;
 	struct sd_node cur_nodes[SD_MAX_NODES];
-	int old_nr_vnodes;
-	struct sd_vnode old_vnodes[SD_MAX_VNODES];
-	int cur_nr_vnodes;
-	struct sd_vnode cur_vnodes[SD_MAX_VNODES];
+
+	struct vnode_info *old_vnodes;
+	struct vnode_info *cur_vnodes;
 };
 
 static struct recovery_work *next_rw;
@@ -62,25 +61,20 @@ static int obj_cmp(const void *oid1, con
 	return 0;
 }
 
-static void *get_vnodes_from_epoch(uint32_t epoch, int *nr, int *copies)
+static struct vnode_info *get_vnodes_from_epoch(uint32_t epoch)
 {
-	int nodes_nr, len = sizeof(struct sd_vnode) * SD_MAX_VNODES;
 	struct sd_node nodes[SD_MAX_NODES];
-	void *buf = xmalloc(len);
+	int nr_nodes;
 
-	nodes_nr = epoch_log_read_nr(epoch, (void *)nodes, sizeof(nodes));
-	if (nodes_nr < 0) {
-		nodes_nr = epoch_log_read_remote(epoch, (void *)nodes, sizeof(nodes));
-		if (nodes_nr == 0) {
-			free(buf);
+	nr_nodes = epoch_log_read_nr(epoch, (void *)nodes, sizeof(nodes));
+	if (nr_nodes < 0) {
+		nr_nodes = epoch_log_read_remote(epoch, (void *)nodes, sizeof(nodes));
+		if (nr_nodes == 0)
 			return NULL;
-		}
-		nodes_nr /= sizeof(nodes[0]);
+		nr_nodes /= sizeof(nodes[0]);
 	}
 
-	*nr = nodes_to_vnodes(nodes, nodes_nr, buf);
-	*copies = get_max_nr_copies_from(nodes, nodes_nr);
-	return buf;
+	return alloc_vnode_info(nodes, nr_nodes);
 }
 
 static int recover_object_from_replica(uint64_t oid,
@@ -180,22 +174,6 @@ out:
 	return ret;
 }
 
-static void rollback_old_cur(struct sd_vnode *old, int *old_nr, int *old_copies,
-			     struct sd_vnode *cur, int *cur_nr, int *cur_copies,
-			     struct sd_vnode *new_old, int new_old_nr, int new_old_copies)
-{
-	int nr_old = *old_nr;
-	int copies_old = *old_copies;
-
-	memcpy(cur, old, sizeof(*old) * nr_old);
-	*cur_nr = nr_old;
-	*cur_copies = copies_old;
-	memcpy(old, new_old, sizeof(*new_old) * new_old_nr);
-	*old_nr = new_old_nr;
-	*old_copies = new_old_copies;
-}
-
-
 /*
  * A virtual node that does not match any node in current node list
  * means the node has left the cluster, then it's an invalid virtual node.
@@ -216,33 +194,28 @@ static int is_invalid_vnode(struct sd_vn
  */
 static int do_recover_object(struct recovery_work *rw)
 {
-	struct sd_vnode *old, *cur;
+	struct vnode_info *old, *cur;
 	uint64_t oid = rw->oids[rw->done];
-	int old_nr = rw->old_nr_vnodes, cur_nr = rw->cur_nr_vnodes;
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
-	struct sd_vnode *tgt_entry;
-	int old_copies, cur_copies, ret, i, retry;
+	int nr_copies, ret, i, retry;
 
-	old = xmalloc(sizeof(*old) * SD_MAX_VNODES);
-	cur = xmalloc(sizeof(*cur) * SD_MAX_VNODES);
-	memcpy(old, rw->old_vnodes, sizeof(*old) * old_nr);
-	memcpy(cur, rw->cur_vnodes, sizeof(*cur) * cur_nr);
-	old_copies = get_max_nr_copies_from(rw->old_nodes, rw->old_nr_nodes);
-	cur_copies = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
+	old = grab_vnode_info(rw->old_vnodes);
+	cur = grab_vnode_info(rw->cur_vnodes);
 
 again:
 	dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n",
 		oid, tgt_epoch);
-	retry = 0;
+
 	/* Let's do a breadth-first search */
-	for (i = 0; i < old_copies; i++) {
-		int idx;
-		idx = obj_to_sheep(old, old_nr, oid, i);
-		tgt_entry = old + idx;
-		if (is_invalid_vnode(tgt_entry, rw->cur_nodes,
+	retry = 0;
+	nr_copies = get_nr_copies(old);
+	for (i = 0; i < nr_copies; i++) {
+		struct sd_vnode *tgt_vnode = oid_to_vnode(old, oid, i);
+
+		if (is_invalid_vnode(tgt_vnode, rw->cur_nodes,
 				     rw->cur_nr_nodes))
 			continue;
-		ret = recover_object_from_replica(oid, tgt_entry,
+		ret = recover_object_from_replica(oid, tgt_vnode,
 						  epoch, tgt_epoch);
 		if (ret == 0) {
 			/* Succeed */
@@ -253,16 +226,17 @@ again:
 			continue;
 		}
 	}
+
 	/* If not succeed but someone orders us to retry it, serve the order */
 	if (ret != 0 && retry == 1) {
 		ret = 0;
 		rw->retry = 1;
 		goto err;
 	}
+
 	/* No luck, roll back to an older configuration and try again */
 	if (ret < 0) {
-		struct sd_vnode *new_old;
-		int new_old_nr, new_old_copies;
+		struct vnode_info *new_old;
 
 		tgt_epoch--;
 		if (tgt_epoch < 1) {
@@ -271,19 +245,20 @@
 			goto err;
 		}
 
-		new_old = get_vnodes_from_epoch(tgt_epoch, &new_old_nr, &new_old_copies);
+		new_old = get_vnodes_from_epoch(tgt_epoch);
 		if (!new_old) {
 			ret = -1;
 			goto err;
 		}
-		rollback_old_cur(old, &old_nr, &old_copies, cur, &cur_nr, &cur_copies,
-				 new_old, new_old_nr, new_old_copies);
-		free(new_old);
+
+		put_vnode_info(cur);
+		cur = old;
+		old = new_old;
 
 		goto again;
 	}
 err:
-	free(old);
-	free(cur);
+	put_vnode_info(old);
+	put_vnode_info(cur);
 	return ret;
 }
 
@@ -432,6 +407,14 @@ static void flush_wait_obj_requests(void
 	process_request_event_queues();
 }
 
+static void free_recovery_work(struct recovery_work *rw)
+{
+	put_vnode_info(rw->cur_vnodes);
+	put_vnode_info(rw->old_vnodes);
+	free(rw->oids);
+	free(rw);
+}
+
 static void do_recover_main(struct work *work)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work, work);
@@ -477,9 +460,7 @@ static void do_recover_main(struct work
 
 	recovering_work = NULL;
 	sys->recovered_epoch = rw->epoch;
-
-	free(rw->oids);
-	free(rw);
+	free_recovery_work(rw);
 
 	if (next_rw) {
 		rw = next_rw;
@@ -563,23 +544,20 @@ int merge_objlist(uint64_t *list1, int n
 
 static int screen_obj_list(struct recovery_work *rw, uint64_t *list, int list_nr)
 {
-	int ret, i, cp, idx;
+	int ret, i, j;
 	struct strbuf buf = STRBUF_INIT;
-	struct sd_vnode *nodes = rw->cur_vnodes;
-	int nodes_nr = rw->cur_nr_vnodes;
-	int nr_objs = get_max_nr_copies_from(rw->cur_nodes, rw->cur_nr_nodes);
-	int idx_buf[SD_MAX_COPIES];
+	struct sd_vnode *vnodes[SD_MAX_COPIES];
+	int nr_objs;
 
+	nr_objs = get_nr_copies(rw->cur_vnodes);
 	for (i = 0; i < list_nr; i++) {
-		obj_to_sheeps(nodes, nodes_nr, list[i], nr_objs, idx_buf);
-		for (cp = 0; cp < nr_objs; cp++) {
-			idx = idx_buf[cp];
-			if (vnode_is_local(&nodes[idx]))
+		oid_to_vnodes(rw->cur_vnodes, list[i], nr_objs, vnodes);
+		for (j = 0; j < nr_objs; j++) {
+			if (vnode_is_local(vnodes[j])) {
+				strbuf_add(&buf, &list[i], sizeof(uint64_t));
 				break;
+			}
 		}
-		if (cp == nr_objs)
-			continue;
-		strbuf_add(&buf, &list[i], sizeof(uint64_t));
 	}
 
 	memcpy(list, buf.buf, buf.len);
@@ -689,11 +667,9 @@ static int init_rw(struct recovery_work
 		eprintf("failed to read epoch log for epoch %"PRIu32"\n",
			epoch - 1);
 		return -1;
 	}
-	rw->old_nr_vnodes = nodes_to_vnodes(rw->old_nodes, rw->old_nr_nodes,
-					    rw->old_vnodes);
-	rw->cur_nr_vnodes = nodes_to_vnodes(rw->cur_nodes, rw->cur_nr_nodes,
-					    rw->cur_vnodes);
+	rw->old_vnodes = alloc_vnode_info(rw->old_nodes, rw->old_nr_nodes);
+	rw->cur_vnodes = alloc_vnode_info(rw->cur_nodes, rw->cur_nr_nodes);
 
 	return 0;
 }
@@ -739,11 +715,9 @@ int start_recovery(uint32_t epoch)
 	}
 
 	if (recovering_work != NULL) {
-		if (next_rw) {
-			/* skip the previous epoch recovery */
-			free(next_rw->oids);
-			free(next_rw);
-		}
+		/* skip the previous epoch recovery */
+		if (next_rw)
+			free_recovery_work(next_rw);
 		next_rw = rw;
 	} else {
 		recovering_work = rw;
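
For readers who don't have the rest of this series in front of them: struct vnode_info and the helpers used above (alloc_vnode_info(), grab_vnode_info(), put_vnode_info(), get_nr_copies(), oid_to_vnode()) are defined outside this diff. The sketch below is only a rough approximation of that reference-counting scheme, with simplified fields and a placeholder SD_MAX_VNODES, meant to show why the patch can drop rollback_old_cur()'s memcpy games: each epoch's vnode list is built once, shared by reference, and freed by whoever drops the last reference.

/*
 * NOTE: simplified sketch, not the actual sheepdog definitions.
 * struct sd_vnode, the field names and SD_MAX_VNODES are placeholders.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define SD_MAX_VNODES 64		/* placeholder, not the real value */

struct sd_vnode {			/* stand-in for the real struct sd_vnode */
	uint64_t id;
	uint16_t node_idx;
};

struct vnode_info {
	struct sd_vnode vnodes[SD_MAX_VNODES];
	int nr_vnodes;
	int refcnt;			/* number of users holding this list */
};

/* Build the vnode list for one epoch and return it with one reference. */
static struct vnode_info *alloc_vnode_info(const struct sd_vnode *vnodes,
					   int nr_vnodes)
{
	struct vnode_info *info = calloc(1, sizeof(*info));

	if (!info)
		return NULL;
	memcpy(info->vnodes, vnodes, sizeof(*vnodes) * nr_vnodes);
	info->nr_vnodes = nr_vnodes;
	info->refcnt = 1;
	return info;
}

/* Take an extra reference, e.g. when do_recover_object starts using the list. */
static struct vnode_info *grab_vnode_info(struct vnode_info *info)
{
	info->refcnt++;
	return info;
}

/* Drop a reference; the last user frees the list. */
static void put_vnode_info(struct vnode_info *info)
{
	if (info && --info->refcnt == 0)
		free(info);
}

int main(void)
{
	struct sd_vnode v[2] = { { 1, 0 }, { 2, 1 } };
	struct vnode_info *cur = alloc_vnode_info(v, 2);
	struct vnode_info *user = grab_vnode_info(cur);	/* refcnt == 2 */

	put_vnode_info(user);	/* owner still holds a reference */
	put_vnode_info(cur);	/* last put frees the list */
	return 0;
}

With something like this in place, rolling back one epoch in do_recover_object() becomes the plain pointer swap seen in the patch (put_vnode_info(cur); cur = old; old = new_old;), and free_recovery_work() simply drops the two references the work item holds instead of copying fixed-size vnode arrays around.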