[sheepdog] [PATCH v3 11/11] sheep: fix deadlock in cluster_force_recover
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Fri Apr 19 10:55:30 CEST 2013
We cannot call exec_req in cluster_force_recover because that function
runs in the main thread and exec_req blocks.  With this patch, sheep
gets the epoch info in a worker thread and then notifies it to all the
nodes.
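
For context, sheep's op dispatch gives every op two optional callbacks:
process_work runs in a worker thread where blocking calls are allowed,
and process_main runs in the main (event loop) thread where they are
not.  The standalone sketch below illustrates that split; it is not
sheepdog code, the names (struct request, force_recover_work,
force_recover_main) are only illustrative, and the pthread_join stands
in for the event loop's completion notification.  Build with -pthread.

/*
 * Minimal sketch of the work/main split this patch introduces.
 * The blocking step (gathering epoch info) runs in a worker
 * thread; the main thread only applies the collected result.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

struct request {
        uint32_t epoch;   /* filled in by the worker thread */
        int result;       /* 0 on success */
};

/* Worker-thread half: may block. */
static void *force_recover_work(void *arg)
{
        struct request *req = arg;

        sleep(1);         /* stand-in for the blocking epoch lookup */
        req->epoch = 42;
        req->result = 0;
        return NULL;
}

/* Main-thread half: must not block, only applies the result. */
static void force_recover_main(const struct request *req)
{
        if (req->result != 0) {
                fprintf(stderr, "force recover failed\n");
                return;
        }
        printf("recovering cluster at epoch %u\n",
               (unsigned)req->epoch);
}

int main(void)
{
        struct request req = { 0, 0 };
        pthread_t worker;

        pthread_create(&worker, NULL, force_recover_work, &req);
        pthread_join(&worker, NULL);  /* event loop would get a callback */
        force_recover_main(&req);
        return 0;
}

In the patch itself the same split is expressed by setting both
.process_work and .process_main in the FORCE_RECOVER entry of
sd_op_template, as the sheep/ops.c hunk below shows.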
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/cluster.c | 13 ++++++++---
sheep/group.c | 4 ++--
sheep/ops.c | 64 ++++++++++++++++++++++++++++++++++++++++--------------
sheep/sheep_priv.h | 2 ++
4 files changed, 62 insertions(+), 21 deletions(-)
diff --git a/collie/cluster.c b/collie/cluster.c
index da2effa..0c5ac13 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -325,7 +325,9 @@ static int cluster_force_recover(int argc, char **argv)
{
int ret;
struct sd_req hdr;
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
char str[123] = {'\0'};
+ struct sd_node nodes[SD_MAX_NODES];
if (!cluster_cmd_data.force) {
int i, l;
@@ -341,10 +343,15 @@ static int cluster_force_recover(int argc, char **argv)
}
sd_init_req(&hdr, SD_OP_FORCE_RECOVER);
+ hdr.data_length = sizeof(nodes);
- ret = send_light_req(&hdr, sdhost, sdport);
- if (ret) {
- fprintf(stderr, "failed to execute request\n");
+ ret = collie_exec_req(sdhost, sdport, &hdr, nodes);
+ if (ret < 0)
+ return EXIT_SYSFAIL;
+
+ if (rsp->result != SD_RES_SUCCESS) {
+ fprintf(stderr, "failed to execute request, %s\n",
+ sd_strerror(rsp->result));
return EXIT_FAILURE;
}
diff --git a/sheep/group.c b/sheep/group.c
index 1c2a9a9..3017164 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -166,8 +166,8 @@ void put_vnode_info(struct vnode_info *vnode_info)
}
}
-static struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
- size_t nr_nodes)
+struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
+ size_t nr_nodes)
{
struct vnode_info *vnode_info;
diff --git a/sheep/ops.c b/sheep/ops.c
index a6494b7..96b25eb 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -480,13 +480,10 @@ static int local_get_epoch(struct request *req)
return SD_RES_SUCCESS;
}
-static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
- void *data)
+static int cluster_force_recover_work(struct request *req)
{
- struct vnode_info *old_vnode_info, *vnode_info;
- int ret = SD_RES_SUCCESS;
- uint8_t c;
- uint16_t f;
+ struct vnode_info *old_vnode_info;
+ uint32_t epoch = sys_epoch();
/*
* We should manually recover the cluster when
@@ -494,8 +491,48 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
* 2) some nodes are physically down (same epoch condition).
* In both cases, the node(s) state is WAIT_FOR_JOIN.
*/
- if (sys->status != SD_STATUS_WAIT_FOR_JOIN)
+ if (sys->status != SD_STATUS_WAIT_FOR_JOIN || req->vinfo == NULL)
+ return SD_RES_FORCE_RECOVER;
+
+ old_vnode_info = get_vnode_info_epoch(epoch, req->vinfo);
+ if (!old_vnode_info) {
+ sd_printf(SDOG_EMERG, "cannot get vnode info for epoch %d",
+ epoch);
+ put_vnode_info(old_vnode_info);
return SD_RES_FORCE_RECOVER;
+ }
+
+ if (req->rq.data_length <
+ sizeof(*old_vnode_info->nodes) * old_vnode_info->nr_nodes) {
+ sd_eprintf("too small buffer size, %d", req->rq.data_length);
+ return SD_RES_INVALID_PARMS;
+ }
+
+ req->rp.epoch = epoch;
+ req->rp.data_length = sizeof(*old_vnode_info->nodes) *
+ old_vnode_info->nr_nodes;
+ memcpy(req->data, old_vnode_info->nodes, req->rp.data_length);
+
+ put_vnode_info(old_vnode_info);
+
+ return SD_RES_SUCCESS;
+}
+
+static int cluster_force_recover_main(const struct sd_req *req,
+ struct sd_rsp *rsp,
+ void *data)
+{
+ struct vnode_info *old_vnode_info, *vnode_info;
+ int ret = SD_RES_SUCCESS;
+ uint8_t c;
+ uint16_t f;
+ struct sd_node *nodes = data;
+ size_t nr_nodes = rsp->data_length / sizeof(*nodes);
+
+ if (rsp->epoch != sys->epoch) {
+ sd_eprintf("epoch was incremented while cluster_force_recover");
+ return SD_RES_FORCE_RECOVER;
+ }
ret = get_cluster_copies(&c);
if (ret) {
@@ -511,14 +548,6 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
sys->nr_copies = c;
sys->flags = f;
- vnode_info = get_vnode_info();
- old_vnode_info = get_vnode_info_epoch(sys->epoch, vnode_info);
- if (!old_vnode_info) {
- sd_printf(SDOG_EMERG, "cannot get vnode info for epoch %d",
- sys->epoch);
- goto err;
- }
-
sys->epoch++; /* some nodes are left, so we get a new epoch */
ret = log_current_epoch();
if (ret) {
@@ -531,6 +560,8 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp,
else
sys->status = SD_STATUS_HALT;
+ vnode_info = get_vnode_info();
+ old_vnode_info = alloc_vnode_info(nodes, nr_nodes);
start_recovery(vnode_info, old_vnode_info);
put_vnode_info(vnode_info);
put_vnode_info(old_vnode_info);
@@ -993,7 +1024,8 @@ static struct sd_op_template sd_ops[] = {
.name = "FORCE_RECOVER",
.type = SD_OP_TYPE_CLUSTER,
.force = true,
- .process_main = cluster_force_recover,
+ .process_work = cluster_force_recover_work,
+ .process_main = cluster_force_recover_main,
},
[SD_OP_SNAPSHOT] = {
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 56a5acc..6434acf 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -271,6 +271,8 @@ bool have_enough_zones(void);
struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
struct vnode_info *get_vnode_info(void);
void put_vnode_info(struct vnode_info *vinfo);
+struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
+ size_t nr_nodes);
struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
struct vnode_info *cur_vinfo);
void wait_get_vdis_done(void);
--
1.8.1.3.566.gaa39828