[Sheepdog] [PATCH 03/14] fix I/O accesses to multiple unrecovered objects

Sat May 14 09:03:48 CEST 2011

If clients access unrecovered objects, Sheepdog should recover them
first.  This fixes a bug that occurs when there is more than one such
object.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/store.c |   47 +++++++++++++++++++++++++++++------------------
 1 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/sheep/store.c b/sheep/store.c
index 9f04ccc..c229679 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -1054,13 +1054,13 @@ struct recovery_work {
 	struct work work;
 	struct list_head rw_siblings;
 
+	int nr_blocking;
 	int count;
 	uint64_t *oids;
 };
 
 static LIST_HEAD(recovery_work_list);
 static struct recovery_work *recovering_work;
-static uint64_t blocking_oid;
 
 /*
  * find_tgt_node - find the node from which we should recover objects
@@ -1334,9 +1334,6 @@ static void recover_one(struct work *work, int idx)
 
 	eprintf("%"PRIu32" %"PRIu32", %16"PRIx64"\n", rw->done, rw->count, oid);
 
-	if (blocking_oid)
-		oid = blocking_oid; /* recover the blocked object first */
-
 	fd = ob_open(epoch, oid, 0, &ret);
 	if (fd != -1) {
 		/* the object is already recovered */
@@ -1417,7 +1414,6 @@ out:
 }
 
 static struct recovery_work *suspended_recovery_work;
-static uint64_t recovering_oid;
 
 static void __start_recovery(struct work *work, int idx);
 
@@ -1431,7 +1427,6 @@ static void recover_timer(void *data)
 		return;
 	}
 
-	recovering_oid = oid;
 	queue_work(&rw->work);
 }
 
@@ -1450,16 +1445,15 @@ void resume_recovery_work(void)
 		return;
 
 	suspended_recovery_work = NULL;
-	recovering_oid = oid;
 	queue_work(&rw->work);
 }
 
 int is_recoverying_oid(uint64_t oid)
 {
 	uint64_t hval = fnv_64a_buf(&oid, sizeof(uint64_t), FNV1A_64_INIT);
-	uint64_t recovering_hval = fnv_64a_buf(&recovering_oid, sizeof(uint64_t), FNV1A_64_INIT);
+	uint64_t min_hval;
 	struct recovery_work *rw = recovering_work;
-	int ret, fd;
+	int ret, fd, i;
 
 	if (oid == 0)
 		return 0;
@@ -1467,6 +1461,8 @@ int is_recoverying_oid(uint64_t oid)
 	if (!rw)
 		return 0; /* there is no thread working for object recovery */
 
+	min_hval = fnv_64a_buf(&rw->oids[rw->done + rw->nr_blocking], sizeof(uint64_t), FNV1A_64_INIT);
+
 	if (before(rw->epoch, sys->epoch))
 		return 1;
 
@@ -1480,11 +1476,27 @@ int is_recoverying_oid(uint64_t oid)
 		return 0;
 	}
 
-	if (recovering_hval <= hval) {
-		if (bsearch(&oid, rw->oids + rw->done,
-			    rw->count - rw->done, sizeof(oid), obj_cmp)) {
+	/* the first 'rw->nr_blocking' objects were already scheduled to be done earlier */
+	for (i = 0; i < rw->nr_blocking; i++)
+		if (rw->oids[rw->done + i] == oid)
+			return 1;
+
+	if (min_hval <= hval) {
+		uint64_t *p;
+		p = bsearch(&oid, rw->oids + rw->done + rw->nr_blocking,
+			    rw->count - rw->done - rw->nr_blocking, sizeof(oid), obj_cmp);
+		if (p) {
 			dprintf("recover the object %" PRIx64 " first\n", oid);
-			blocking_oid = oid;
+			if (rw->nr_blocking == 0)
+				rw->nr_blocking = 1; /* the first oid may be processed now */
+			if (p > rw->oids + rw->done + rw->nr_blocking) {
+				/* this object should be recovered earlier */
+				memmove(rw->oids + rw->done + rw->nr_blocking + 1,
+					rw->oids + rw->done + rw->nr_blocking,
+					sizeof(uint64_t) * (p - (rw->oids + rw->done + rw->nr_blocking)));
+				rw->oids[rw->done + rw->nr_blocking] = oid;
+				rw->nr_blocking++;
+			}
 			return 1;
 		}
 	}
@@ -1500,8 +1512,11 @@ static void recover_done(struct work *work, int idx)
 
 	if (rw->state == RW_INIT)
 		rw->state = RW_RUN;
-	else if (!rw->retry)
+	else if (!rw->retry) {
 		rw->done++;
+		if (rw->nr_blocking > 0)
+			rw->nr_blocking--;
+	}
 
 	oid = rw->oids[rw->done];
 
@@ -1514,8 +1529,6 @@ static void recover_done(struct work *work, int idx)
 		return;
 	}
 
-	blocking_oid = 0;
-
 	if (rw->done < rw->count && list_empty(&recovery_work_list)) {
 		rw->work.fn = recover_one;
 
@@ -1523,14 +1536,12 @@ static void recover_done(struct work *work, int idx)
 			suspended_recovery_work = rw;
 			return;
 		}
-		recovering_oid = oid;
 		resume_pending_requests();
 		queue_work(&rw->work);
 		return;
 	}
 
 	dprintf("recovery done, %"PRIu32"\n", rw->epoch);
-	recovering_oid = 0;
 	recovering_work = NULL;
 
 	sys->recovered_epoch = rw->epoch;
-- 
1.5.6.5