[sheepdog] [PATCH V3 3/4] sheep: do the real work of disable/enable recovery

Wed Aug 1 07:17:02 CEST 2012

From: Yunkai Zhang <qiushu.zyk at taobao.com>

V3:
- update commit log a bit
------------------------- >8

After recovery is disabled, all recovery operations in sd_join_handler will
be paused, and current_vnode_info will not be updated until recovery is
enabled again. A disable_recovery variable was added to join_message so that
a joining sheep can learn the cluster's recovery status. Joining nodes are
stored in an internal temporary array which is used when recovery is enabled.

At most one recovery operation will be executed when the user sends the
"collie cluster recover enable" command. If there are no joining nodes to
recover, no recovery will be done.

Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
 include/internal_proto.h |  1 +
 sheep/group.c            | 75 ++++++++++++++++++++++++++++++++++++++----------
 sheep/ops.c              | 22 ++++++++++++++
 sheep/sheep_priv.h       |  7 +++++
 4 files changed, 90 insertions(+), 15 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 1651f9c..9819b08 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -198,6 +198,7 @@ struct join_message {
 	uint32_t epoch;
 	uint64_t ctime;
 	uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
+	uint8_t disable_recovery;
 	uint8_t store[STORE_LEN];
 
 	/*
diff --git a/sheep/group.c b/sheep/group.c
index 16cbdaf..238cbdb 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -38,6 +38,11 @@ struct vdi_bitmap_work {
 	struct sd_node members[];
 };
 
+struct sd_node joining_nodes[SD_MAX_NODES];
+size_t nr_joining_nodes = 0;
+struct sd_node all_nodes[SD_MAX_NODES];
+size_t nr_all_nodes = 0;
+
 static struct vnode_info *current_vnode_info;
 
 static size_t get_join_message_size(struct join_message *jm)
@@ -147,6 +152,17 @@ struct vnode_info *get_vnode_info(void)
 }
 
 /*
+ * update currently active vnode information structure,
+ * this must only be called from the main thread.
+ */
+void update_vnode_info(struct vnode_info *vnode_info)
+{
+
+	put_vnode_info(current_vnode_info);
+	current_vnode_info = vnode_info;
+}
+
+/*
  * Release a reference to the current vnode information.
  *
  * Must be called from the main thread.
@@ -184,7 +200,7 @@ void oid_to_vnodes(struct vnode_info *vnode_info, uint64_t oid, int nr_copies,
 	}
 }
 
-static struct vnode_info *alloc_vnode_info(struct sd_node *nodes,
+struct vnode_info *alloc_vnode_info(struct sd_node *nodes,
 					   size_t nr_nodes)
 {
 	struct vnode_info *vnode_info;
@@ -477,7 +493,7 @@ static void format_exceptional_node_list(struct join_message *jm)
 		jm->nodes[jm->nr_failed_nodes + jm->nr_delayed_nodes++] = n->ent;
 }
 
-static void clear_exceptional_node_lists(void)
+void clear_exceptional_node_lists(void)
 {
 	struct node *n, *t;
 
@@ -772,20 +788,42 @@ static void get_vdi_bitmap(struct sd_node *nodes, size_t nr_nodes)
 	queue_work(sys->block_wqueue, &w->work);
 }
 
+static void prepare_recovery(struct sd_node *joined,
+				    struct sd_node *nodes, size_t nr_nodes)
+{
+	int i;
+
+	joining_nodes[nr_joining_nodes++] = *joined;
+	if (!nr_all_nodes) {
+		/* exclude the newly added one */
+		for (i = 0; i < nr_nodes; i++) {
+			if (!node_eq(nodes + i, joined))
+				all_nodes[nr_all_nodes++] = nodes[i];
+		}
+	}
+
+	if (!current_vnode_info)
+		current_vnode_info = alloc_vnode_info(nodes, nr_nodes);
+}
+
 static void update_cluster_info(struct join_message *msg,
 				struct sd_node *joined, struct sd_node *nodes,
 				size_t nr_nodes)
 {
-	struct vnode_info *old_vnode_info;
+	struct vnode_info *old_vnode_info = NULL;
 
 	eprintf("status = %d, epoch = %d, finished: %d\n", msg->cluster_status,
 		msg->epoch, sys->join_finished);
 
+	sys->disable_recovery = msg->disable_recovery;
+
 	if (!sys->join_finished)
 		finish_join(msg, joined, nodes, nr_nodes);
 
-	old_vnode_info = current_vnode_info;
-	current_vnode_info = alloc_vnode_info(nodes, nr_nodes);
+	if (!sys->disable_recovery) {
+		old_vnode_info = current_vnode_info;
+		current_vnode_info = alloc_vnode_info(nodes, nr_nodes);
+	}
 
 	switch (msg->cluster_status) {
 	case SD_STATUS_OK:
@@ -809,16 +847,21 @@ static void update_cluster_info(struct join_message *msg,
 		sys->status = msg->cluster_status;
 
 		if (msg->inc_epoch) {
-			uatomic_inc(&sys->epoch);
-			log_current_epoch();
-			clear_exceptional_node_lists();
-
-			if (!old_vnode_info) {
-				old_vnode_info = alloc_old_vnode_info(joined,
-						nodes, nr_nodes);
-			}
-
-			start_recovery(current_vnode_info, old_vnode_info);
+			if (!sys->disable_recovery) {
+				uatomic_inc(&sys->epoch);
+				log_current_epoch();
+				clear_exceptional_node_lists();
+
+				if (!old_vnode_info) {
+					old_vnode_info =
+						alloc_old_vnode_info( joined,
+							nodes, nr_nodes);
+				}
+
+				start_recovery(current_vnode_info,
+					       old_vnode_info);
+			} else
+				prepare_recovery(joined, nodes, nr_nodes);
 		}
 
 		if (have_enough_zones())
@@ -894,6 +937,7 @@ enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void *opaque)
 		vprintf(SDOG_DEBUG, "%s\n", node_to_str(&sys->this_node));
 
 		jm->cluster_status = sys->status;
+		jm->disable_recovery = sys->disable_recovery;
 
 		epoch = get_latest_epoch();
 		if (!epoch)
@@ -918,6 +962,7 @@ enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void *opaque)
 	}
 
 	jm->cluster_status = sys->status;
+	jm->disable_recovery = sys->disable_recovery;
 	jm->inc_epoch = 0;
 
 	switch (sys->status) {
diff --git a/sheep/ops.c b/sheep/ops.c
index c6e33ce..af8d373 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -270,6 +270,28 @@ static int cluster_shutdown(const struct sd_req *req, struct sd_rsp *rsp,
 static int cluster_enable_recover(const struct sd_req *req,
 				    struct sd_rsp *rsp, void *data)
 {
+	int i;
+	struct vnode_info *old_vnode_info, *vnode_info;
+
+	if (nr_joining_nodes) {
+
+		for (i = 0; i < nr_joining_nodes; i++)
+			all_nodes[nr_all_nodes++] = joining_nodes[i];
+
+		old_vnode_info = get_vnode_info();
+		vnode_info = alloc_vnode_info(all_nodes, nr_all_nodes);
+		update_vnode_info(vnode_info);
+
+		uatomic_inc(&sys->epoch);
+		log_current_epoch();
+		clear_exceptional_node_lists();
+
+		start_recovery(vnode_info, old_vnode_info);
+
+		put_vnode_info(old_vnode_info);
+	}
+
+	nr_joining_nodes = 0;
 	sys->disable_recovery = 0;
 	return SD_RES_SUCCESS;
 }
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 998c846..0c0e588 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -184,6 +184,10 @@ extern char *jrnl_path;
 extern char *epoch_path;
 extern mode_t def_fmode;
 extern mode_t def_dmode;
+extern struct sd_node joining_nodes[];
+extern size_t nr_joining_nodes;
+extern struct sd_node all_nodes[];
+extern size_t nr_all_nodes;
 
 /* One should call this function to get sys->epoch outside main thread */
 static inline uint32_t sys_epoch(void)
@@ -216,8 +220,11 @@ int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
 		void *data);
 
 bool have_enough_zones(void);
+void clear_exceptional_node_lists(void);
 struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
 struct vnode_info *get_vnode_info(void);
+void update_vnode_info(struct vnode_info *vnode_info);
+struct vnode_info *alloc_vnode_info(struct sd_node *nodes, size_t nr_nodes);
 void put_vnode_info(struct vnode_info *vnodes);
 struct vnode_info *get_vnode_info_epoch(uint32_t epoch);
 
-- 
1.7.11.2