[sheepdog] [PATCH v2 01/10] sheep: do breadth first search in recover_object_from_replica()
MORITA Kazutaka
morita.kazutaka at gmail.com
Thu May 9 04:38:50 CEST 2013
From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
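Move the breadth-first search over an object's replica candidates from do_recover_object() into recover_object_from_replica(). This prepares for the succeeding patches in this series.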
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
sheep/recovery.c | 150 ++++++++++++++++++++++++++++--------------------------
1 file changed, 79 insertions(+), 71 deletions(-)
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 4b3455a..1429391 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -67,117 +67,123 @@ static int obj_cmp(const void *oid1, const void *oid2)
return 0;
}
-static int recover_object_from_replica(uint64_t oid,
- const struct sd_vnode *vnode,
+/*
+ * A virtual node that does not match any node in current node list
+ * means the node has left the cluster, then it's an invalid virtual node.
+ */
+static bool is_invalid_vnode(const struct sd_vnode *entry,
+ struct sd_node *nodes, int nr_nodes)
+{
+ if (bsearch(entry, nodes, nr_nodes, sizeof(struct sd_node),
+ node_id_cmp))
+ return false;
+ return true;
+}
+
+static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
+ struct vnode_info *cur,
uint32_t epoch, uint32_t tgt_epoch)
{
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
unsigned rlen;
- int ret = SD_RES_NO_MEM;
+ int nr_copies, ret;
void *buf = NULL;
struct siocb iocb = { 0 };
- if (vnode_is_local(vnode) && tgt_epoch < sys_epoch()) {
- ret = sd_store->link(oid, tgt_epoch);
- goto out;
- }
-
rlen = get_objsize(oid);
buf = valloc(rlen);
if (!buf) {
+ ret = SD_RES_NO_MEM;
sd_eprintf("%m");
goto out;
}
- sd_init_req(&hdr, SD_OP_READ_PEER);
- hdr.epoch = epoch;
- hdr.flags = SD_FLAG_CMD_RECOVERY;
- hdr.data_length = rlen;
- hdr.obj.oid = oid;
- hdr.obj.tgt_epoch = tgt_epoch;
+ /* Let's do a breadth-first search */
+ nr_copies = get_obj_copy_number(oid, old->nr_zones);
+ for (int i = 0; i < nr_copies; i++) {
+ const struct sd_vnode *vnode;
- ret = sheep_exec_req(&vnode->nid, &hdr, buf);
- if (ret != SD_RES_SUCCESS)
- goto out;
- iocb.epoch = epoch;
- iocb.length = rsp->data_length;
- iocb.offset = rsp->obj.offset;
- iocb.buf = buf;
- ret = sd_store->create_and_write(oid, &iocb);
-out:
- if (ret == SD_RES_SUCCESS) {
- sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
- tgt_epoch, epoch);
- objlist_cache_insert(oid);
+ vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);
+
+ if (is_invalid_vnode(vnode, cur->nodes, cur->nr_nodes))
+ continue;
+
+ if (vnode_is_local(vnode) && tgt_epoch < sys_epoch()) {
+ ret = sd_store->link(oid, tgt_epoch);
+ goto out;
+ }
+
+ sd_init_req(&hdr, SD_OP_READ_PEER);
+ hdr.epoch = epoch;
+ hdr.flags = SD_FLAG_CMD_RECOVERY;
+ hdr.data_length = rlen;
+ hdr.obj.oid = oid;
+ hdr.obj.tgt_epoch = tgt_epoch;
+
+ ret = sheep_exec_req(&vnode->nid, &hdr, buf);
+ if (ret == SD_RES_SUCCESS) {
+ iocb.epoch = epoch;
+ iocb.length = rsp->data_length;
+ iocb.offset = rsp->obj.offset;
+ iocb.buf = buf;
+ ret = sd_store->create_and_write(oid, &iocb);
+ }
+
+ switch (ret) {
+ case SD_RES_SUCCESS:
+ sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
+ tgt_epoch, epoch);
+ objlist_cache_insert(oid);
+ goto out;
+ case SD_RES_OLD_NODE_VER:
+ /* move to the next epoch recovery */
+ goto out;
+ default:
+ break;
+ }
}
+out:
free(buf);
return ret;
}
/*
- * A virtual node that does not match any node in current node list
- * means the node has left the cluster, then it's an invalid virtual node.
- */
-static bool is_invalid_vnode(const struct sd_vnode *entry,
- struct sd_node *nodes, int nr_nodes)
-{
- if (bsearch(entry, nodes, nr_nodes, sizeof(struct sd_node),
- node_id_cmp))
- return false;
- return true;
-}
-
-/*
* Recover the object from its track in epoch history. That is,
* the routine will try to recovery it from the nodes it has stayed,
* at least, *theoretically* on consistent hash ring.
*/
static int do_recover_object(struct recovery_work *rw)
{
- struct vnode_info *old;
+ struct vnode_info *old, *cur;
uint64_t oid = rw->oids[rw->done];
uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch;
- int nr_copies, ret, i;
+ int ret;
+ struct vnode_info *new_old;
old = grab_vnode_info(rw->old_vinfo);
-
+ cur = grab_vnode_info(rw->cur_vinfo);
again:
sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid,
tgt_epoch);
- /* Let's do a breadth-first search */
- nr_copies = get_obj_copy_number(oid, old->nr_zones);
- for (i = 0; i < nr_copies; i++) {
- const struct sd_vnode *tgt_vnode;
-
- tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);
-
- if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
- rw->cur_vinfo->nr_nodes))
- continue;
- ret = recover_object_from_replica(oid, tgt_vnode,
- epoch, tgt_epoch);
- if (ret == SD_RES_SUCCESS) {
- /* Succeed */
- break;
- } else if (SD_RES_OLD_NODE_VER == ret) {
- rw->stop = true;
- goto err;
- } else
- ret = -1;
- }
-
- /* No luck, roll back to an older configuration and try again */
- if (ret < 0) {
- struct vnode_info *new_old;
+ ret = recover_object_from_replica(oid, old, cur, epoch, tgt_epoch);
+ switch (ret) {
+ case SD_RES_SUCCESS:
+ /* Succeed */
+ break;
+ case SD_RES_OLD_NODE_VER:
+ rw->stop = true;
+ break;
+ default:
+ /* No luck, roll back to an older configuration and try again */
rollback:
tgt_epoch--;
if (tgt_epoch < 1) {
sd_eprintf("can not recover oid %"PRIx64, oid);
ret = -1;
- goto err;
+ break;
}
new_old = get_vnode_info_epoch(tgt_epoch, rw->cur_vinfo);
@@ -185,12 +191,14 @@ rollback:
/* We rollback in case we don't get a valid epoch */
goto rollback;
- put_vnode_info(old);
+ put_vnode_info(cur);
+ cur = old;
old = new_old;
goto again;
}
-err:
+
put_vnode_info(old);
+ put_vnode_info(cur);
return ret;
}
--
1.7.9.5
More information about the sheepdog
mailing list