[sheepdog] [PATCH v2 01/10] sheep: do breadth first search in recover_object_from_replica()

MORITA Kazutaka morita.kazutaka at gmail.com
Thu May 9 04:38:50 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

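Move the breadth-first search over the object's replicas from
do_recover_object() into recover_object_from_replica(), which now takes
the old and current vnode_info instead of a single target vnode.  There
is no intended functional change; this prepares for the succeeding patches.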

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/recovery.c |  150 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 79 insertions(+), 71 deletions(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index 4b3455a..1429391 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -67,117 +67,123 @@ static int obj_cmp(const void *oid1, const void *oid2)
 	return 0;
 }
 
-static int recover_object_from_replica(uint64_t oid,
-				       const struct sd_vnode *vnode,
+/*
+ * Report whether a virtual node has no matching entry in the given node
+ * list, which means its node has left the cluster and must be skipped.
+ */
+static bool is_invalid_vnode(const struct sd_vnode *entry,
+			     struct sd_node *nodes, int nr_nodes)
+{
+	const void *hit = bsearch(entry, nodes, nr_nodes,
+				  sizeof(struct sd_node), node_id_cmp);
+
+	return hit == NULL;
+}
+
+static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
+				       struct vnode_info *cur,
 				       uint32_t epoch, uint32_t tgt_epoch)
 {
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	unsigned rlen;
-	int ret = SD_RES_NO_MEM;
+	int nr_copies, ret = -1; /* stays -1 if no replica is tried */
 	void *buf = NULL;
 	struct siocb iocb = { 0 };
 
-	if (vnode_is_local(vnode) && tgt_epoch < sys_epoch()) {
-		ret = sd_store->link(oid, tgt_epoch);
-		goto out;
-	}
-
 	rlen = get_objsize(oid);
 	buf = valloc(rlen);
 	if (!buf) {
+		ret = SD_RES_NO_MEM;
 		sd_eprintf("%m");
 		goto out;
 	}
 
-	sd_init_req(&hdr, SD_OP_READ_PEER);
-	hdr.epoch = epoch;
-	hdr.flags = SD_FLAG_CMD_RECOVERY;
-	hdr.data_length = rlen;
-	hdr.obj.oid = oid;
-	hdr.obj.tgt_epoch = tgt_epoch;
+	/* Let's do a breadth-first search */
+	nr_copies = get_obj_copy_number(oid, old->nr_zones);
+	for (int i = 0; i < nr_copies; i++) {
+		const struct sd_vnode *vnode;
 
-	ret = sheep_exec_req(&vnode->nid, &hdr, buf);
-	if (ret != SD_RES_SUCCESS)
-		goto out;
-	iocb.epoch = epoch;
-	iocb.length = rsp->data_length;
-	iocb.offset = rsp->obj.offset;
-	iocb.buf = buf;
-	ret = sd_store->create_and_write(oid, &iocb);
-out:
-	if (ret == SD_RES_SUCCESS) {
-		sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
-			tgt_epoch, epoch);
-		objlist_cache_insert(oid);
+		vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);
+
+		if (is_invalid_vnode(vnode, cur->nodes, cur->nr_nodes))
+			continue;
+
+		if (vnode_is_local(vnode) && tgt_epoch < sys_epoch()) {
+			ret = sd_store->link(oid, tgt_epoch);
+			goto out;
+		}
+
+		sd_init_req(&hdr, SD_OP_READ_PEER);
+		hdr.epoch = epoch;
+		hdr.flags = SD_FLAG_CMD_RECOVERY;
+		hdr.data_length = rlen;
+		hdr.obj.oid = oid;
+		hdr.obj.tgt_epoch = tgt_epoch;
+
+		ret = sheep_exec_req(&vnode->nid, &hdr, buf);
+		if (ret == SD_RES_SUCCESS) {
+			iocb.epoch = epoch;
+			iocb.length = rsp->data_length;
+			iocb.offset = rsp->obj.offset;
+			iocb.buf = buf;
+			ret = sd_store->create_and_write(oid, &iocb);
+		}
+
+		switch (ret) {
+		case SD_RES_SUCCESS:
+			sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
+				   tgt_epoch, epoch);
+			objlist_cache_insert(oid);
+			goto out;
+		case SD_RES_OLD_NODE_VER:
+			/* move to the next epoch recovery */
+			goto out;
+		default:
+			break;
+		}
 	}
+out:
 	free(buf);
 	return ret;
 }
 
 /*
- * A virtual node that does not match any node in current node list
- * means the node has left the cluster, then it's an invalid virtual node.
- */
-static bool is_invalid_vnode(const struct sd_vnode *entry,
-			     struct sd_node *nodes, int nr_nodes)
-{
-	if (bsearch(entry, nodes, nr_nodes, sizeof(struct sd_node),
-		    node_id_cmp))
-		return false;
-	return true;
-}
-
-/*
  * Recover the object from its track in epoch history. That is,
  * the routine will try to recovery it from the nodes it has stayed,
  * at least, *theoretically* on consistent hash ring.
  */
 static int do_recover_object(struct recovery_work *rw)
 {
-	struct vnode_info *old;
+	struct vnode_info *old, *cur;
 	uint64_t oid = rw->oids[rw->done];
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch;
-	int nr_copies, ret, i;
+	int ret;
+	struct vnode_info *new_old;
 
 	old = grab_vnode_info(rw->old_vinfo);
-
+	cur = grab_vnode_info(rw->cur_vinfo);
 again:
 	sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid,
 		   tgt_epoch);
 
-	/* Let's do a breadth-first search */
-	nr_copies = get_obj_copy_number(oid, old->nr_zones);
-	for (i = 0; i < nr_copies; i++) {
-		const struct sd_vnode *tgt_vnode;
-
-		tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);
-
-		if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
-				     rw->cur_vinfo->nr_nodes))
-			continue;
-		ret = recover_object_from_replica(oid, tgt_vnode,
-						  epoch, tgt_epoch);
-		if (ret == SD_RES_SUCCESS) {
-			/* Succeed */
-			break;
-		} else if (SD_RES_OLD_NODE_VER == ret) {
-			rw->stop = true;
-			goto err;
-		} else
-			ret = -1;
-	}
-
-	/* No luck, roll back to an older configuration and try again */
-	if (ret < 0) {
-		struct vnode_info *new_old;
+	ret = recover_object_from_replica(oid, old, cur, epoch, tgt_epoch);
 
+	switch (ret) {
+	case SD_RES_SUCCESS:
+		/* Succeed */
+		break;
+	case SD_RES_OLD_NODE_VER:
+		rw->stop = true;
+		break;
+	default:
+		/* No luck, roll back to an older configuration and try again */
 rollback:
 		tgt_epoch--;
 		if (tgt_epoch < 1) {
 			sd_eprintf("can not recover oid %"PRIx64, oid);
 			ret = -1;
-			goto err;
+			break;
 		}
 
 		new_old = get_vnode_info_epoch(tgt_epoch, rw->cur_vinfo);
@@ -185,12 +191,14 @@ rollback:
 			/* We rollback in case we don't get a valid epoch */
 			goto rollback;
 
-		put_vnode_info(old);
+		put_vnode_info(cur);
+		cur = old;
 		old = new_old;
 		goto again;
 	}
-err:
+
 	put_vnode_info(old);
+	put_vnode_info(cur);
 	return ret;
 }
 
-- 
1.7.9.5




More information about the sheepdog mailing list