[sheepdog] [PATCH v3 11/11] sheep: fix deadlock in cluster_force_recover
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Fri Apr 19 10:55:30 CEST 2013
We cannot call exec_req in cluster_force_recover because that function
runs in the main thread and exec_req blocks.  With this patch, sheep
gets the epoch info in a worker thread and then notifies it to all the
nodes.
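
For context, sheep's op dispatch gives every op two optional callbacks:
process_work runs in a worker thread where blocking calls are allowed,
and process_main runs in the main (event loop) thread where they are
not.  The standalone sketch below illustrates that split; it is not
sheepdog code, the names (struct request, force_recover_work,
force_recover_main) are only illustrative, and the pthread_join stands
in for the event loop's completion notification.  Build with -pthread.

/*
 * Minimal sketch of the work/main split this patch introduces.
 * The blocking step (gathering epoch info) runs in a worker
 * thread; the main thread only applies the collected result.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

struct request {
        uint32_t epoch;   /* filled in by the worker thread */
        int result;       /* 0 on success */
};

/* Worker-thread half: may block. */
static void *force_recover_work(void *arg)
{
        struct request *req = arg;

        sleep(1);         /* stand-in for the blocking epoch lookup */
        req->epoch = 42;
        req->result = 0;
        return NULL;
}

/* Main-thread half: must not block, only applies the result. */
static void force_recover_main(const struct request *req)
{
        if (req->result != 0) {
                fprintf(stderr, "force recover failed\n");
                return;
        }
        printf("recovering cluster at epoch %u\n",
               (unsigned)req->epoch);
}

int main(void)
{
        struct request req = { 0, 0 };
        pthread_t worker;

        pthread_create(&worker, NULL, force_recover_work, &req);
        pthread_join(&worker, NULL);  /* event loop would get a callback */
        force_recover_main(&req);
        return 0;
}

In the patch itself the same split is expressed by setting both
.process_work and .process_main in the FORCE_RECOVER entry of
sd_op_template, as the sheep/ops.c hunk below shows.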
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/cluster.c | 13 ++++++++---
sheep/group.c | 4 ++--
sheep/ops.c | 64 ++++++++++++++++++++++++++++++++++++++++--------------
sheep/sheep_priv.h | 2 ++
4 files changed, 62 insertions(+), 21 deletions(-)
diff --git a/collie/cluster.c b/collie/cluster.c
index da2effa..0c5ac13 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -325,7 +325,9 @@ static int cluster_force_recover(int argc, char **argv)
{
int ret;
struct sd_req hdr;
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
char str[123] = {'\0'};
+ struct sd_node nodes[SD_MAX_NODES];
if (!cluster_cmd_data.force) {
int i, l;
@@ -341,10 +343,15 @@ static int cluster_force_recover(int argc, char **argv)
}
sd_init_req(&hdr, SD_OP_FORCE_RECOVER);
+ hdr.data_length = sizeof(nodes);
- ret = send_light_req(&hdr, sdhost, sdport);
- if (ret) {
- fprintf(stderr, "failed to execute request\n");
+ ret = collie_exec_req(sdhost, sdport, &hdr, nodes);
+ if (ret < 0)
+ return EXIT_SYSFAIL;
+
+ if (rsp->result != SD_RES_SUCCESS) {
+ fprintf(stderr, "failed to execute request, %s\n",
+ sd_strerror(rsp->result));
return EXIT_FAILURE;
}
diff --git a/sheep/group.c b/sheep/group.c
index 1c2a9a9..3017164 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -166,8 +166,8 @@ void put_vnode_info(struct vnode_info *vnode_info)
}
}
-static struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
- size_t nr_nodes)
+struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
+ size_t nr_nodes)
{
struct vnode_info *vnode_info;
diff --git a/sheep/ops.c b/sheep/ops.c
index a6494b7..96b25eb 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -480,13 +480,10 @@ static int local_get_epoch(struct request *req)
return SD_RES_SUCCESS;
}
-static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
- void *data)
+static int cluster_force_recover_work(struct request *req)
{
- struct vnode_info *old_vnode_info, *vnode_info;
- int ret = SD_RES_SUCCESS;
- uint8_t c;
- uint16_t f;
+ struct vnode_info *old_vnode_info;
+ uint32_t epoch = sys_epoch();
/*
* We should manually recover the cluster when
@@ -494,8 +491,48 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
* 2) some nodes are physically down (same epoch condition).
* In both cases, the node(s) state is WAIT_FOR_JOIN.
*/
- if (sys->status != SD_STATUS_WAIT_FOR_JOIN)
+ if (sys->status != SD_STATUS_WAIT_FOR_JOIN || req->vinfo == NULL)
+ return SD_RES_FORCE_RECOVER;
+
+ old_vnode_info = get_vnode_info_epoch(epoch, req->vinfo);
+ if (!old_vnode_info) {
+ sd_printf(SDOG_EMERG, "cannot get vnode info for epoch %d",
+ epoch);
+ put_vnode_info(old_vnode_info);
return SD_RES_FORCE_RECOVER;
+ }
+
+ if (req->rq.data_length <
+ sizeof(*old_vnode_info->nodes) * old_vnode_info->nr_nodes) {
+ sd_eprintf("too small buffer size, %d", req->rq.data_length);
+ return SD_RES_INVALID_PARMS;
+ }
+
+ req->rp.epoch = epoch;
+ req->rp.data_length = sizeof(*old_vnode_info->nodes) *
+ old_vnode_info->nr_nodes;
+ memcpy(req->data, old_vnode_info->nodes, req->rp.data_length);
+
+ put_vnode_info(old_vnode_info);
+
+ return SD_RES_SUCCESS;
+}
+
+static int cluster_force_recover_main(const struct sd_req *req,
+ struct sd_rsp *rsp,
+ void *data)
+{
+ struct vnode_info *old_vnode_info, *vnode_info;
+ int ret = SD_RES_SUCCESS;
+ uint8_t c;
+ uint16_t f;
+ struct sd_node *nodes = data;
+ size_t nr_nodes = rsp->data_length / sizeof(*nodes);
+
+ if (rsp->epoch != sys->epoch) {
+ sd_eprintf("epoch was incremented while cluster_force_recover");
+ return SD_RES_FORCE_RECOVER;
+ }
ret = get_cluster_copies(&c);
if (ret) {
@@ -511,14 +548,6 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
sys->nr_copies = c;
sys->flags = f;
- vnode_info = get_vnode_info();
- old_vnode_info = get_vnode_info_epoch(sys->epoch, vnode_info);
- if (!old_vnode_info) {
- sd_printf(SDOG_EMERG, "cannot get vnode info for epoch %d",
- sys->epoch);
- goto err;
- }
-
sys->epoch++; /* some nodes are left, so we get a new epoch */
ret = log_current_epoch();
if (ret) {
@@ -531,6 +560,8 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
else
sys->status = SD_STATUS_HALT;
+ vnode_info = get_vnode_info();
+ old_vnode_info = alloc_vnode_info(nodes, nr_nodes);
start_recovery(vnode_info, old_vnode_info);
put_vnode_info(vnode_info);
put_vnode_info(old_vnode_info);
@@ -993,7 +1024,8 @@ static struct sd_op_template sd_ops[] = {
.name = "FORCE_RECOVER",
.type = SD_OP_TYPE_CLUSTER,
.force = true,
- .process_main = cluster_force_recover,
+ .process_work = cluster_force_recover_work,
+ .process_main = cluster_force_recover_main,
},
[SD_OP_SNAPSHOT] = {
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 56a5acc..6434acf 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -271,6 +271,8 @@ bool have_enough_zones(void);
struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
struct vnode_info *get_vnode_info(void);
void put_vnode_info(struct vnode_info *vinfo);
+struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
+ size_t nr_nodes);
struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
struct vnode_info *cur_vinfo);
void wait_get_vdis_done(void);
--
1.8.1.3.566.gaa39828