[sheepdog] [PATCH v2 3/4] sheep: add support for manual recovery

Wed Sep 12 09:05:06 CEST 2012

This implements disabling object recovery.  When recovery is disabled,
sheep will suspend the recovery process after it recovers objects in
the prio_oids queue.  The suspended recovery is resumed after new
objects are pushed into the prio_oids queue.  This means that
unaccessed objects are not recovered at all.

Note that sheep increments epoch even when recovery is disabled.  If
sheep receives a write request, it will place the updated object based
on the current node membership.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/cluster.c         |    3 +-
 include/internal_proto.h |    4 ++-
 sheep/group.c            |    2 +
 sheep/ops.c              |    1 +
 sheep/recovery.c         |   63 ++++++++++++++++++++++++++++++++++++++++------
 sheep/sheep_priv.h       |    1 +
 6 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/collie/cluster.c b/collie/cluster.c
index 7ebbb83..9774c7f 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -386,8 +386,7 @@ static int cluster_disable_recover(int argc, char **argv)
 	if (ret)
 		return EXIT_FAILURE;
 
-	printf("*Note*: Only disable the recovery caused by JOIN envets\n"
-	       "Cluster recovery: disable\n");
+	printf("Cluster recovery: disable\n");
 	return EXIT_SUCCESS;
 }
 
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 2fb4a0a..b4199ca 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -197,11 +197,13 @@ struct join_message {
 	int16_t nr_nodes;
 	uint16_t nr_failed_nodes;
 	uint16_t nr_delayed_nodes;
-	uint16_t cluster_flags;
 	uint32_t cluster_status;
 	uint32_t epoch;
 	uint64_t ctime;
 	uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
+	uint8_t disable_recovery;
+	uint16_t cluster_flags;
+	uint32_t __pad;
 	uint8_t store[STORE_LEN];
 
 	/*
diff --git a/sheep/group.c b/sheep/group.c
index 0a3fe08..94466f3 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -836,6 +836,7 @@ static void update_cluster_info(struct join_message *msg,
 			set_cluster_ctime(msg->ctime);
 			/*FALLTHROUGH*/
 		case SD_STATUS_WAIT_FOR_JOIN:
+			sys->disable_recovery = msg->disable_recovery;
 			get_vdis(nodes, nr_nodes);
 			break;
 		default:
@@ -994,6 +995,7 @@ enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void *opaque)
 	jm->epoch = sys->epoch;
 	jm->ctime = get_cluster_ctime();
 	jm->nr_failed_nodes = 0;
+	jm->disable_recovery = sys->disable_recovery;
 
 	if (sd_store)
 		strcpy((char *)jm->store, sd_store->name);
diff --git a/sheep/ops.c b/sheep/ops.c
index a96fe41..4542335 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -318,6 +318,7 @@ static int cluster_enable_recover(const struct sd_req *req,
 				    struct sd_rsp *rsp, void *data)
 {
 	sys->disable_recovery = 0;
+	resume_suspended_recovery();
 	return SD_RES_SUCCESS;
 }
 
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 4df5b66..eb70296 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -30,11 +30,14 @@ struct recovery_work {
 
 	int stop;
 	struct work work;
+	bool suspended; /* true when automatic recovery is disabled
+			 * and recovery process is suspended */
 
 	int count;
 	uint64_t *oids;
 	uint64_t *prio_oids;
 	int nr_prio_oids;
+	int nr_scheduled_prio_oids;
 
 	struct vnode_info *old_vinfo;
 	struct vnode_info *cur_vinfo;
@@ -269,13 +272,15 @@ static inline void prepare_schedule_oid(uint64_t oid)
 				oid);
 			return;
 		}
-	/* The oid is currently being recovered */
-	if (rw->oids[rw->done] == oid)
+	/* When auto recovery is enabled, the oid is currently being
+	 * recovered */
+	if (!sys->disable_recovery && rw->oids[rw->done] == oid)
 		return;
 	rw->nr_prio_oids++;
 	rw->prio_oids = xrealloc(rw->prio_oids,
 				 rw->nr_prio_oids * sizeof(uint64_t));
 	rw->prio_oids[rw->nr_prio_oids - 1] = oid;
+	resume_suspended_recovery();
 
 	dprintf("%"PRIx64" nr_prio_oids %d\n", oid, rw->nr_prio_oids);
 }
@@ -431,9 +436,51 @@ static inline void finish_schedule_oids(struct recovery_work *rw)
 done:
 	free(rw->prio_oids);
 	rw->prio_oids = NULL;
+	rw->nr_scheduled_prio_oids += rw->nr_prio_oids;
 	rw->nr_prio_oids = 0;
 }
 
+/*
+ * When automatic object recovery is disabled, recovery is 'lazy': sheep
+ * still recovers objects being accessed by clients, for availability.
+ * Returns true while the recovery queue contains such scheduled objects.
+ */
+static bool has_scheduled_objects(struct recovery_work *rw)
+{
+	return rw->done < rw->nr_scheduled_prio_oids;
+}
+
+/* Hand over to next_rw if one is pending; otherwise finish scheduling
+ * prio oids and either queue the next object or suspend recovery. */
+static void recover_next_object(struct recovery_work *rw)
+{
+	if (next_rw) {
+		run_next_rw(rw);
+		return;
+	}
+
+	if (rw->nr_prio_oids)
+		finish_schedule_oids(rw);
+
+	if (sys->disable_recovery && !has_scheduled_objects(rw)) {
+		dprintf("suspended\n");
+		rw->suspended = true;
+		/* suspend until resume_suspended_recovery() is called */
+		return;
+	}
+
+	/* Try recover next object */
+	queue_work(sys->recovery_wqueue, &rw->work);
+}
+
+void resume_suspended_recovery(void)
+{
+	if (!recovering_work || !recovering_work->suspended)
+		return;	/* no recovery work is currently suspended */
+	recovering_work->suspended = false; /* clear before continuing */
+	recover_next_object(recovering_work);
+}
+
 static void recover_object_main(struct work *work)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work,
@@ -457,11 +504,7 @@ static void recover_object_main(struct work *work)
 	resume_wait_obj_requests(rw->oids[rw->done++]);
 
 	if (rw->done < rw->count) {
-		if (rw->nr_prio_oids)
-			finish_schedule_oids(rw);
-
-		/* Try recover next object */
-		queue_work(sys->recovery_wqueue, &rw->work);
+		recover_next_object(rw);
 		return;
 	}
 
@@ -490,7 +533,7 @@ static void finish_object_list(struct work *work)
 	resume_wait_recovery_requests();
 	rw->work.fn = recover_object_work;
 	rw->work.done = recover_object_main;
-	queue_work(sys->recovery_wqueue, &rw->work);
+	recover_next_object(rw);
 	return;
 }
 
@@ -673,6 +716,10 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
 			free_recovery_work(next_rw);
 		dprintf("recovery skipped\n");
 		next_rw = rw;
+
+		/* This is necessary to invoke run_next_rw when
+		 * recovery work is suspended. */
+		resume_suspended_recovery();
 	} else {
 		recovering_work = rw;
 		queue_work(sys->recovery_wqueue, &rw->work);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index bb7f5fe..bfc5b9d 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -255,6 +255,7 @@ void resume_wait_epoch_requests(void);
 void resume_wait_obj_requests(uint64_t oid);
 void resume_wait_recovery_requests(void);
 void flush_wait_obj_requests(void);
+void resume_suspended_recovery(void);
 
 int create_cluster(int port, int64_t zone, int nr_vnodes,
 		   bool explicit_addr);
-- 
1.7.2.5