[sheepdog] [PATCH] make recovery not to retry when recover_object_from_replica() fail

Mon May 28 11:22:52 CEST 2012

From: levin li <xingke.lwp at taobao.com>

Since we make sheep to wait to retry when epoch is inconsistent,
recover_object_from_replica() will never get a response with
SD_RES_NEW_NODE_VER, because the peer node will retry the request
itself locally until epoch gets consistent.

If epoch of request sender is old than the receiver, it would get
SD_RES_OLD_NODE_VER, in this case, it means the epoch it's to increment
and soon a new recovery work would replace the current one, we should
not waste time recovering for the out-of-date recovery work, what we
should do is to make the current recovery work cease to wait for replacement.

As for SD_RES_NETWORK_ERROR, currently, recover_object_from_replica() will
get SD_RES_NEWWORK_ERROR only if there's an EIO when reading the object,
in this case we should not make recovery retry, because next time it may
get an EIO either and so that make the recovery work hang there retrying
constantly, we should make it retry another copies or in another epoch.

Signed-off-by: levin li <xingke.lwp at taobao.com>
---
 sheep/recovery.c |   67 +++++++++++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 43 deletions(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index 3fb34c0..48d1dd1 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -30,7 +30,7 @@ struct recovery_work {
 	uint32_t done;
 
 	struct timer timer;
-	int retry;
+	int cease;
 	struct work work;
 
 	int nr_blocking;
@@ -160,11 +160,9 @@ static int recover_object_from_replica(uint64_t oid,
 			ret = -1;
 			goto out;
 		}
-	} else if (rsp->result == SD_RES_NEW_NODE_VER ||
-			rsp->result == SD_RES_OLD_NODE_VER ||
-			rsp->result == SD_RES_NETWORK_ERROR) {
-		dprintf("retrying: %"PRIx32", %"PRIx64"\n", rsp->result, oid);
-		ret = 1;
+	} else if (rsp->result == SD_RES_OLD_NODE_VER) {
+		eprintf("ceasing, res: %"PRIx32"\n", rsp->result);
+		ret = rsp->result;
 		goto out;
 	} else {
 		eprintf("failed, res: %"PRIx32"\n", rsp->result);
@@ -221,7 +219,7 @@ static int do_recover_object(struct recovery_work *rw)
 	int old_nr = rw->old_nr_vnodes, cur_nr = rw->cur_nr_vnodes;
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
 	struct sd_vnode *tgt_entry;
-	int old_copies, cur_copies, ret, i, retry;
+	int old_copies, cur_copies, ret, i;
 
 	old = xmalloc(sizeof(*old) * SD_MAX_VNODES);
 	cur = xmalloc(sizeof(*cur) * SD_MAX_VNODES);
@@ -233,7 +231,6 @@ static int do_recover_object(struct recovery_work *rw)
 again:
 	dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n",
 		oid, tgt_epoch);
-	retry = 0;
 	/* Let's do a breadth-first search */
 	for (i = 0; i < old_copies; i++) {
 		int idx;
@@ -247,18 +244,12 @@ again:
 		if (ret == 0) {
 			/* Succeed */
 			break;
-		} else if (ret > 0) {
-			retry = 1;
-			/* Try our best to recover from peers */
-			continue;
+		} else if (SD_RES_OLD_NODE_VER == ret) {
+			rw->cease = 1;
+			goto err;
 		}
 	}
-	/* If not succeed but someone orders us to retry it, serve the order */
-	if (ret != 0 && retry == 1) {
-		ret = 0;
-		rw->retry = 1;
-		goto err;
-	}
+
 	/* No luck, roll back to an older configuration and try again */
 	if (ret < 0) {
 		struct sd_vnode *new_old;
@@ -312,19 +303,6 @@ static void recover_object(struct work *work)
 
 static struct recovery_work *suspended_recovery_work;
 
-static void recover_timer(void *data)
-{
-	struct recovery_work *rw = (struct recovery_work *)data;
-	uint64_t oid = rw->oids[rw->done];
-
-	if (is_access_to_busy_objects(oid)) {
-		suspended_recovery_work = rw;
-		return;
-	}
-
-	queue_work(sys->recovery_wqueue, &rw->work);
-}
-
 void resume_recovery_work(void)
 {
 	struct recovery_work *rw;
@@ -441,7 +419,7 @@ static void do_recover_main(struct work *work)
 		rw->state = RW_RUN;
 		recovered_oid = 0;
 		resume_wait_recovery_requests();
-	} else if (!rw->retry) {
+	} else if (!rw->cease){
 		rw->done++;
 		if (rw->nr_blocking > 0)
 			rw->nr_blocking--;
@@ -452,18 +430,14 @@ static void do_recover_main(struct work *work)
 	if (recovered_oid)
 		resume_wait_obj_requests(recovered_oid);
 
-	if (rw->retry && !next_rw) {
-		rw->retry = 0;
-
-		rw->timer.callback = recover_timer;
-		rw->timer.data = rw;
-		add_timer(&rw->timer, 2);
-		return;
-	}
-
 	if (rw->done < rw->count && !next_rw) {
 		rw->work.fn = recover_object;
 
+		if (rw->cease) {
+			flush_wait_obj_requests();
+			return;
+		}
+
 		if (is_access_to_busy_objects(oid)) {
 			suspended_recovery_work = rw;
 			return;
@@ -488,6 +462,7 @@ static void do_recover_main(struct work *work)
 		flush_wait_obj_requests();
 
 		recovering_work = rw;
+
 		queue_work(sys->recovery_wqueue, &rw->work);
 	} else {
 		if (sd_store->end_recover) {
@@ -606,9 +581,11 @@ static int newly_joined(struct sd_node *node, struct recovery_work *rw)
 	return 0;
 }
 
+#define MAX_RETRY 5
+
 static int fill_obj_list(struct recovery_work *rw)
 {
-	int i;
+	int i, retry = 0;
 	uint8_t *buf = NULL;
 	size_t buf_size = SD_DATA_OBJ_SIZE; /* FIXME */
 	int retry_cnt;
@@ -617,10 +594,14 @@ static int fill_obj_list(struct recovery_work *rw)
 	int start = random() % cur_nr;
 	int end = cur_nr;
 
+alloc:
 	buf = malloc(buf_size);
 	if (!buf) {
 		eprintf("out of memory\n");
-		rw->retry = 1;
+		if (retry++ < MAX_RETRY) {
+			sleep(1);
+			goto alloc;
+		}
 		return -1;
 	}
 
-- 
1.7.10