[Sheepdog] [PATCH 4/8] sheep: avoid processing I/O until the target object is recovered

Sat Aug 28 22:55:30 CEST 2010

I/O operations always fail if the target object has not yet been
recovered after the previous node membership change, so such operations
should be delayed until the object is recovered.

This patch sorts the objects to be recovered by the hash of their object
ID, which makes it easy to check whether a given object is recovered or not.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/group.c      |    1 +
 sheep/sheep_priv.h |    2 ++
 sheep/store.c      |   23 ++++++++++++++++++++++-
 3 files changed, 25 insertions(+), 1 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 806c780..2e378a4 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -775,6 +775,7 @@ static void vdi_op_done(struct vdi_op_message *msg)
 			remove_epoch(i);
 
 		sys->epoch = 1;
+		sys->recovered_epoch = 1;
 		nr_nodes = get_ordered_sd_node_list(entry);
 
 		dprintf("write epoch log, %d, %d\n", sys->epoch, nr_nodes);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index ce2e0e6..05c0052 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -120,6 +120,8 @@ struct cluster_info {
 	struct cpg_event *cur_cevent;
 	unsigned long cpg_event_work_flags;
 	int nr_outstanding_io;
+
+	uint32_t recovered_epoch;
 };
 
 extern struct cluster_info *sys;
diff --git a/sheep/store.c b/sheep/store.c
index 7ab7dfa..b777602 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -36,6 +36,18 @@ static char *mnt_path;
 static mode_t def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP;
 static mode_t def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 
+static int obj_cmp(const void *oid1, const void *oid2) /* comparator passed to qsort() */
+{
+	const uint64_t hval1 = fnv_64a_buf((void *)oid1, sizeof(uint64_t), FNV1A_64_INIT);
+	const uint64_t hval2 = fnv_64a_buf((void *)oid2, sizeof(uint64_t), FNV1A_64_INIT);
+	/* Order by the fnv_64a_buf() hash of each 64-bit oid, not by the oid value itself. */
+	if (hval1 < hval2)
+		return -1;
+	if (hval1 > hval2)
+		return 1;
+	return 0; /* equal hash values compare as equal */
+}
+
 static int stat_sheep(uint64_t *store_size, uint64_t *store_free, uint32_t epoch)
 {
 	struct statvfs vs;
@@ -1326,7 +1338,11 @@ void resume_recovery_work(void)
 
 int is_recoverying_oid(uint64_t oid)
 {
-	return recovering_oid && recovering_oid == oid;
+	uint64_t hval = fnv_64a_buf(&oid, sizeof(uint64_t), FNV1A_64_INIT);
+	uint64_t recovering_hval = fnv_64a_buf(&recovering_oid, sizeof(uint64_t), FNV1A_64_INIT);
+	/* Nonzero if before(recovered_epoch, epoch - 1), or if they are equal and oid's hash is >= recovering_oid's hash. */
+	return before(sys->recovered_epoch, sys->epoch - 1) ||
+		(sys->recovered_epoch == sys->epoch - 1 && recovering_hval <= hval);
 }
 
 static void recover_done(struct work *work, int idx)
@@ -1362,6 +1378,9 @@ static void recover_done(struct work *work, int idx)
 	dprintf("recovery done, %"PRIu32"\n", rw->epoch);
 	recovering = 0;
 
+	sys->recovered_epoch = rw->epoch;
+	resume_pending_requests();
+
 	free(rw->buf);
 	free(rw->failed_vdis);
 	free(rw);
@@ -1521,6 +1540,8 @@ static void __start_recovery(struct work *work, int idx)
 	if (rw->retry)
 		goto fail;
 
+	qsort(rw->buf, rw->count, sizeof(uint64_t), obj_cmp);
+
 	snprintf(path, sizeof(path), "%s%08u/list", obj_path, epoch);
 	dprintf("write object list file to %s\n", path);
 
-- 
1.5.6.5