[sheepdog] [PATCH v2] make recovery not retry when recover_object_from_replica() fails

levin li levin108 at gmail.com
Tue May 29 04:53:25 CEST 2012


From: levin li <xingke.lwp at taobao.com>

rebase to latest master branch
------------------------------------------------------------ >8
Since we now make sheep wait and retry when the epoch is inconsistent,
recover_object_from_replica() will never get a response with
SD_RES_NEW_NODE_VER, because the peer node retries the request
itself locally until the epoch becomes consistent.

If the epoch of the request sender is older than the receiver's, the sender
gets SD_RES_OLD_NODE_VER. This means the sender's epoch is about to be
incremented, and a new recovery work will soon replace the current one. We
should not waste time recovering for the out-of-date recovery work; instead,
we should make the current recovery work cease and wait for its replacement.

As for SD_RES_NETWORK_ERROR: currently, recover_object_from_replica() gets
SD_RES_NETWORK_ERROR only if there is an EIO when reading the object.
In this case we should not make recovery retry, because next time it may
get an EIO again, which would leave the recovery work hanging there, retrying
constantly. Instead, we should make it try another copy or another epoch.

Signed-off-by: levin li <xingke.lwp at taobao.com>
---
 sheep/recovery.c |   67 +++++++++++++++++++-----------------------------------
 1 file changed, 23 insertions(+), 44 deletions(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index d5408e3..5df8d85 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -30,7 +30,7 @@ struct recovery_work {
 	uint32_t done;
 
 	struct timer timer;
-	int retry;
+	int cease;
 	struct work work;
 
 	int nr_blocking;
@@ -155,11 +155,9 @@ static int recover_object_from_replica(uint64_t oid,
 			ret = -1;
 			goto out;
 		}
-	} else if (rsp->result == SD_RES_NEW_NODE_VER ||
-			rsp->result == SD_RES_OLD_NODE_VER ||
-			rsp->result == SD_RES_NETWORK_ERROR) {
-		dprintf("retrying: %"PRIx32", %"PRIx64"\n", rsp->result, oid);
-		ret = 1;
+	} else if (rsp->result == SD_RES_OLD_NODE_VER) {
+		eprintf("ceasing, res: %"PRIx32"\n", rsp->result);
+		ret = rsp->result;
 		goto out;
 	} else {
 		eprintf("failed, res: %"PRIx32"\n", rsp->result);
@@ -198,7 +196,7 @@ static int do_recover_object(struct recovery_work *rw)
 	struct vnode_info *old;
 	uint64_t oid = rw->oids[rw->done];
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
-	int nr_copies, ret, i, retry;
+	int nr_copies, ret, i;
 
 	old = grab_vnode_info(rw->old_vnodes);
 
@@ -207,7 +205,6 @@ again:
 		oid, tgt_epoch);
 
 	/* Let's do a breadth-first search */
-	retry = 0;
 	nr_copies = get_nr_copies(old);
 	for (i = 0; i < nr_copies; i++) {
 		struct sd_vnode *tgt_vnode = oid_to_vnode(old, oid, i);
@@ -220,20 +217,12 @@ again:
 		if (ret == 0) {
 			/* Succeed */
 			break;
-		} else if (ret > 0) {
-			retry = 1;
-			/* Try our best to recover from peers */
-			continue;
+		} else if (SD_RES_OLD_NODE_VER == ret) {
+			rw->cease = 1;
+			goto err;
 		}
 	}
 
-	/* If not succeed but someone orders us to retry it, serve the order */
-	if (ret != 0 && retry == 1) {
-		ret = 0;
-		rw->retry = 1;
-		goto err;
-	}
-
 	/* No luck, roll back to an older configuration and try again */
 	if (ret < 0) {
 		struct vnode_info *new_old;
@@ -285,19 +274,6 @@ static void recover_object(struct work *work)
 
 static struct recovery_work *suspended_recovery_work;
 
-static void recover_timer(void *data)
-{
-	struct recovery_work *rw = (struct recovery_work *)data;
-	uint64_t oid = rw->oids[rw->done];
-
-	if (is_access_to_busy_objects(oid)) {
-		suspended_recovery_work = rw;
-		return;
-	}
-
-	queue_work(sys->recovery_wqueue, &rw->work);
-}
-
 void resume_recovery_work(void)
 {
 	struct recovery_work *rw;
@@ -420,7 +396,7 @@ static void do_recover_main(struct work *work)
 		rw->state = RW_RUN;
 		recovered_oid = 0;
 		resume_wait_recovery_requests();
-	} else if (!rw->retry) {
+	} else if (!rw->cease){
 		rw->done++;
 		if (rw->nr_blocking > 0)
 			rw->nr_blocking--;
@@ -431,18 +407,14 @@ static void do_recover_main(struct work *work)
 	if (recovered_oid)
 		resume_wait_obj_requests(recovered_oid);
 
-	if (rw->retry && !next_rw) {
-		rw->retry = 0;
-
-		rw->timer.callback = recover_timer;
-		rw->timer.data = rw;
-		add_timer(&rw->timer, 2);
-		return;
-	}
-
 	if (rw->done < rw->count && !next_rw) {
 		rw->work.fn = recover_object;
 
+		if (rw->cease) {
+			flush_wait_obj_requests();
+			return;
+		}
+
 		if (is_access_to_busy_objects(oid)) {
 			suspended_recovery_work = rw;
 			return;
@@ -465,6 +437,7 @@ static void do_recover_main(struct work *work)
 		flush_wait_obj_requests();
 
 		recovering_work = rw;
+
 		queue_work(sys->recovery_wqueue, &rw->work);
 	} else {
 		if (sd_store->end_recover) {
@@ -576,9 +549,11 @@ static int newly_joined(struct sd_node *node, struct recovery_work *rw)
 	return 1;
 }
 
+#define MAX_RETRY 5
+
 static int fill_obj_list(struct recovery_work *rw)
 {
-	int i;
+	int i, retry = 0;
 	uint8_t *buf = NULL;
 	size_t buf_size = SD_DATA_OBJ_SIZE; /* FIXME */
 	int retry_cnt;
@@ -587,10 +562,14 @@ static int fill_obj_list(struct recovery_work *rw)
 	int start = random() % cur_nr;
 	int end = cur_nr;
 
+alloc:
 	buf = malloc(buf_size);
 	if (!buf) {
 		eprintf("out of memory\n");
-		rw->retry = 1;
+		if (retry++ < MAX_RETRY) {
+			sleep(1);
+			goto alloc;
+		}
 		return -1;
 	}
 
-- 
1.7.10




More information about the sheepdog mailing list