[sheepdog-users] [PATCH 3/3] sheep: recover from local when target node has same content

MORITA Kazutaka morita.kazutaka at gmail.com
Fri May 10 07:28:33 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

This checks the hash value of the target object before reading the entire data,
and recovers from the local copy if an identical object exists in the stale directory.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/recovery.c |  134 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 83 insertions(+), 51 deletions(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index 4c85a08..63c8386 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -47,6 +47,10 @@ struct recovery_obj_work {
 
 	uint64_t oid; /* the object to be recovered */
 	bool stop;
+
+	/* local replica in the stale directory */
+	uint32_t local_epoch;
+	uint8_t local_sha1[SHA1_LEN];
 };
 
 /*
@@ -99,29 +103,73 @@ static int obj_cmp(const void *oid1, const void *oid2)
 	return 0;
 }
 
-/*
- * A virtual node that does not match any node in current node list
- * means the node has left the cluster, then it's an invalid virtual node.
- */
-static bool is_invalid_vnode(const struct sd_vnode *entry,
-			     struct sd_node *nodes, int nr_nodes)
+/* recover oid from vnode, linking a matching stale local replica if any */
+static int recover_object_from(uint64_t oid, const struct sd_vnode *vnode,
+			       uint32_t epoch, uint32_t tgt_epoch,
+			       uint32_t local_epoch, uint8_t *sha1)
 {
-	if (bsearch(entry, nodes, nr_nodes, sizeof(struct sd_node),
-		    node_id_cmp))
-		return false;
-	return true;
+	int ret;
+	unsigned rlen;
+	void *buf = NULL;
+	struct sd_req hdr;
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+	struct siocb iocb = { 0 };
+
+	if (vnode_is_local(vnode)) {
+		if (tgt_epoch < sys_epoch())
+			return sd_store->link(oid, tgt_epoch);
+
+		return SD_RES_NO_OBJ;
+	}
+
+	/* compare the full SHA1 digest first to avoid fetching the object */
+	if (local_epoch > 0) {
+		sd_init_req(&hdr, SD_OP_GET_HASH);
+		hdr.obj.oid = oid;
+		hdr.obj.tgt_epoch = tgt_epoch;
+		ret = sheep_exec_req(&vnode->nid, &hdr, NULL);
+		if (ret != SD_RES_SUCCESS)
+			return ret;
+
+		if (memcmp(rsp->hash.digest, sha1, SHA1_LEN) == 0) {
+			sd_dprintf("use local replica at epoch %d",
+				   local_epoch);
+			ret = sd_store->link(oid, local_epoch);
+			if (ret == SD_RES_SUCCESS)
+				return ret;
+		}
+	}
+
+	rlen = get_objsize(oid);
+	buf = xvalloc(rlen);
+
+	/* recover from remote replica */
+	sd_init_req(&hdr, SD_OP_READ_PEER);
+	hdr.epoch = epoch;
+	hdr.flags = SD_FLAG_CMD_RECOVERY;
+	hdr.data_length = rlen;
+	hdr.obj.oid = oid;
+	hdr.obj.tgt_epoch = tgt_epoch;
+
+	ret = sheep_exec_req(&vnode->nid, &hdr, buf);
+	if (ret == SD_RES_SUCCESS) {
+		iocb.epoch = epoch;
+		iocb.length = rsp->data_length;
+		iocb.offset = rsp->obj.offset;
+		iocb.buf = buf;
+		ret = sd_store->create_and_write(oid, &iocb);
+	}
+
+	free(buf);
+	return ret;
 }
 
 static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
 				       struct vnode_info *cur,
-				       uint32_t epoch, uint32_t tgt_epoch)
+				       uint32_t epoch, uint32_t tgt_epoch,
+				       uint32_t local_epoch, uint8_t *sha1)
 {
-	struct sd_req hdr;
-	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-	unsigned rlen;
 	int nr_copies, ret = SD_RES_SUCCESS, start = 0;
-	void *buf = NULL;
-	struct siocb iocb = { 0 };
 	bool fully_replicated = true;
 
 	nr_copies = get_obj_copy_number(oid, old->nr_zones);
@@ -138,9 +186,6 @@ static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
 		}
 	}
 
-	rlen = get_objsize(oid);
-	buf = xvalloc(rlen);
-
 	/* Let's do a breadth-first search */
 	for (int i = 0; i < nr_copies; i++) {
 		const struct sd_vnode *vnode;
@@ -148,41 +193,17 @@ static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
 
 		vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, idx);
 
-		if (is_invalid_vnode(vnode, cur->nodes, cur->nr_nodes))
-			continue;
-
-		if (vnode_is_local(vnode)) {
-			if (tgt_epoch < sys_epoch())
-				ret = sd_store->link(oid, tgt_epoch);
-			else
-				ret = SD_RES_NO_OBJ;
-		} else {
-			sd_init_req(&hdr, SD_OP_READ_PEER);
-			hdr.epoch = epoch;
-			hdr.flags = SD_FLAG_CMD_RECOVERY;
-			hdr.data_length = rlen;
-			hdr.obj.oid = oid;
-			hdr.obj.tgt_epoch = tgt_epoch;
-
-			ret = sheep_exec_req(&vnode->nid, &hdr, buf);
-			if (ret == SD_RES_SUCCESS) {
-				iocb.epoch = epoch;
-				iocb.length = rsp->data_length;
-				iocb.offset = rsp->obj.offset;
-				iocb.buf = buf;
-				ret = sd_store->create_and_write(oid, &iocb);
-			}
-		}
-
+		ret = recover_object_from(oid, vnode, epoch, tgt_epoch,
+					  local_epoch, sha1);
 		switch (ret) {
 		case SD_RES_SUCCESS:
 			sd_dprintf("recovered oid %"PRIx64" from %d "
 				   "to epoch %d", oid, tgt_epoch, epoch);
 			objlist_cache_insert(oid);
-			goto out;
+			return ret;
 		case SD_RES_OLD_NODE_VER:
 			/* move to the next epoch recovery */
-			goto out;
+			return ret;
 		case SD_RES_NO_OBJ:
 			fully_replicated = false;
 			/* fall through */
@@ -200,8 +221,6 @@ static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
 	if (fully_replicated && ret != SD_RES_SUCCESS)
 		ret = SD_RES_STALE_OBJ;
 
-out:
-	free(buf);
 	return ret;
 }
 
@@ -216,6 +235,8 @@ static int do_recover_object(struct recovery_obj_work *row)
 	struct vnode_info *old, *cur;
 	uint64_t oid = row->oid;
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch;
+	uint32_t local_epoch = row->local_epoch;
+	uint8_t *sha1 = row->local_sha1;
 	int ret;
 	struct vnode_info *new_old;
 
@@ -225,7 +246,8 @@ again:
 	sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid,
 		   tgt_epoch);
 
-	ret = recover_object_from_replica(oid, old, cur, epoch, tgt_epoch);
+	ret = recover_object_from_replica(oid, old, cur, epoch, tgt_epoch,
+					  local_epoch, sha1);
 
 	switch (ret) {
 	case SD_RES_SUCCESS:
@@ -276,13 +298,23 @@ static void recover_object_work(struct work *work)
 						     struct recovery_obj_work,
 						     base);
 	uint64_t oid = row->oid;
-	int ret;
+	int ret, epoch;
 
 	if (sd_store->exist(oid)) {
 		sd_dprintf("the object is already recovered");
 		return;
 	}
 
+	/* find object in the stale directory */
+	for (epoch = sys_epoch() - 1; epoch > 0; epoch--) {
+		ret = sd_store->get_hash(oid, epoch, row->local_sha1);
+		if (ret == SD_RES_SUCCESS) {
+			sd_dprintf("replica found in local at epoch %d", epoch);
+			row->local_epoch = epoch;
+			break;
+		}
+	}
+
 	ret = do_recover_object(row);
 	if (ret < 0)
 		sd_eprintf("failed to recover object %"PRIx64, oid);
-- 
1.7.9.5



More information about the sheepdog-users mailing list