[sheepdog] [PATCH V4 2/4] collie: add cluster recover {enable|disable} command

Wed Aug 1 09:19:20 CEST 2012

From: Yunkai Zhang <qiushu.zyk at taobao.com>

V4:
- fix some coding style issues (but keep a few lines that exceed 80 columns,
  which I think is harmless)

V3:
- add a description of node failures while recovery is disabled
---------------------------------------------------------- >8

= Why we need to disable recovery =
After disabling recovery, we can add multiple nodes to the cluster at leisure:
all joining nodes are kept in an internal temporary list, and
current_vnode_info is not changed. Only one recovery operation is triggered
when recovery is enabled again, which helps reduce fluctuation in the cluster.

Even when recovery is disabled, recovery is still executed immediately if
nodes leave the cluster. However, the joining nodes kept in the internal
temporary list do not participate in that recovery; they are kept there until
recovery is enabled again.

PS: recovery is enabled by default.

= Usage =
1) Disable cluster recovery:
   $ collie cluster recover disable
    *Note*: Only disable the recovery caused by JOIN envets
    Cluster recovery: disable

2) Add multiple nodes into cluster
   ...

3) Enable cluster recovery:
   $ collie cluster recover enable
     Cluster recovery: enable

This patch only implements the command-line tool that updates the internal
state; the next patch will do the real work.

Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
 collie/cluster.c         | 39 +++++++++++++++++++++++++++++++++++++++
 include/internal_proto.h |  2 ++
 sheep/ops.c              | 26 ++++++++++++++++++++++++++
 sheep/sheep_priv.h       |  1 +
 4 files changed, 68 insertions(+)

diff --git a/collie/cluster.c b/collie/cluster.c
index 089add9..6043578 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -379,10 +379,49 @@ static int cluster_force_recover(int argc, char **argv)
 	return EXIT_SUCCESS;
 }
 
+static int cluster_disable_recover(int argc, char **argv)
+{
+	int ret;
+	struct sd_req hdr;
+
+	sd_init_req(&hdr, SD_OP_DISABLE_RECOVER);
+	hdr.epoch = sd_epoch;
+
+	ret = send_light_req(&hdr, sdhost, sdport);
+	if (ret)
+		return EXIT_FAILURE;
+
+	printf("*Note*: Only disable the recovery caused by JOIN envets\n"
+	       "Cluster recovery: disable\n");
+	return EXIT_SUCCESS;
+}
+
+static int cluster_enable_recover(int argc, char **argv)
+{
+	int ret;
+	struct sd_req hdr;
+
+	sd_init_req(&hdr, SD_OP_ENABLE_RECOVER);
+	hdr.epoch = sd_epoch;
+
+	ret = send_light_req(&hdr, sdhost, sdport);
+	if (ret)
+		return EXIT_FAILURE;
+
+	printf("Cluster recovery: enable\n");
+	return EXIT_SUCCESS;
+}
+
 /* Subcommand list of recover */
 static struct subcommand cluster_recover_cmd[] = {
 	{"force", NULL, NULL, "force recover cluster immediately",
 	 NULL, 0, cluster_force_recover},
+	{"enable", NULL, NULL, "enable automatic recovery and "
+				"run once recover if necessary",
+	 NULL, 0, cluster_enable_recover},
+	{"disable", NULL, NULL, "disable automatic recovery caused "
+				"by JOIN events (excluding LEAVE events now)",
+	 NULL, 0, cluster_disable_recover},
 	{NULL},
 };
 
diff --git a/include/internal_proto.h b/include/internal_proto.h
index a6e54b8..1651f9c 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -61,6 +61,8 @@
 #define SD_OP_WRITE_PEER     0xA5
 #define SD_OP_REMOVE_PEER    0xA6
 #define SD_OP_SET_CACHE_SIZE 0xA7
+#define SD_OP_ENABLE_RECOVER 0xA8
+#define SD_OP_DISABLE_RECOVER 0xA9
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/sheep/ops.c b/sheep/ops.c
index 75df906..c6e33ce 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -267,6 +267,20 @@ static int cluster_shutdown(const struct sd_req *req, struct sd_rsp *rsp,
 	return SD_RES_SUCCESS;
 }
 
+static int cluster_enable_recover(const struct sd_req *req,
+				    struct sd_rsp *rsp, void *data)
+{
+	sys->disable_recovery = 0;
+	return SD_RES_SUCCESS;
+}
+
+static int cluster_disable_recover(const struct sd_req *req,
+				   struct sd_rsp *rsp, void *data)
+{
+	sys->disable_recovery = 1;
+	return SD_RES_SUCCESS;
+}
+
 static int cluster_get_vdi_attr(struct request *req)
 {
 	const struct sd_req *hdr = &req->rq;
@@ -1026,6 +1040,18 @@ static struct sd_op_template sd_ops[] = {
 		.type = SD_OP_TYPE_PEER,
 		.process_work = peer_remove_obj,
 	},
+
+	[SD_OP_ENABLE_RECOVER] = {
+		.name = "ENABLE_RECOVER",
+		.type = SD_OP_TYPE_CLUSTER,
+		.process_main = cluster_enable_recover,
+	},
+
+	[SD_OP_DISABLE_RECOVER] = {
+		.name = "DISABLE_RECOVER",
+		.type = SD_OP_TYPE_CLUSTER,
+		.process_main = cluster_disable_recover,
+	},
 };
 
 struct sd_op_template *get_sd_op(uint8_t opcode)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index c4225ea..998c846 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -118,6 +118,7 @@ struct cluster_info {
 
 	int use_directio;
 	uint8_t gateway_only;
+	uint8_t disable_recovery;
 
 	struct work_queue *gateway_wqueue;
 	struct work_queue *io_wqueue;
-- 
1.7.11.2