[sheepdog] [PATCH v3 11/11] sheep: bump SD_MAX_NODES from 1024 to 6144

Sat Sep 21 18:12:28 CEST 2013

This actually means we can support SD_MAX_NODES with zookeeper driver.

For simplicity, local and corosync drivers support at most 1024 nodes.

We are still tied to a fixed number sheepdog can support, but now with a much
larger value.

The real limit is determined by the cluster driver under the current node
management code: the maximum broadcast message payload the driver can send
determines the usable SD_MAX_NODES.

With 6144 (6*1024), zookeeper's largest message is 983080 bytes, close to the
1M that is its maximum allowed message size. This means we can actually
increase SD_MAX_NODES a little bit beyond 6144.

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 include/internal_proto.h  |   13 +++++++++++--
 sheep/cluster/zookeeper.c |   10 ++++++++--
 sheep/group.c             |   22 ++++++++++++++++++----
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index c02e066..59c6e2a 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -28,7 +28,16 @@
 #define SD_DEFAULT_COPIES 3
 #define SD_MAX_COPIES 8
 
-#define SD_MAX_NODES 1024
+/*
+ * The max number of nodes sheep daemon can support is constrained by
+ * the number of nodes in the struct cluster_info, but the actual max
+ * number is determined by the cluster driver because we have to pass
+ * sys->cinfo around the cluster to handle membership management.
+ *
+ * Currently, only the zookeeper driver supports SD_MAX_NODES nodes because
+ * its message buffer size is large enough to hold nodes[SD_MAX_NODES].
+ */
+#define SD_MAX_NODES 6144
 #define SD_DEFAULT_VNODES 128
 
 /*
@@ -150,7 +159,7 @@ struct cluster_info {
 	uint32_t __pad;
 	uint8_t store[STORE_LEN];
 
-	/* node list at cluster_info->epoch */
+	/* Node list at cluster_info->epoch */
 	struct sd_node nodes[SD_MAX_NODES];
 };
 
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 83e5a57..7ce8180 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -554,19 +554,25 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
 /*
  * We placehold the enough space to piggyback the nodes information on join
  * response message so that every node can see the same membership view.
+ *
+ * We have to preallocate enough space and set msg_len as
+ * sizeof(struct cluster_info) because of piggyback.
  */
-static int add_join_event(void *msg, size_t msg_len)
+static int add_join_event(void *msg, size_t msglen)
 {
 	struct zk_event ev;
+	size_t msg_len = sizeof(struct cluster_info);
 	size_t len = msg_len + sizeof(struct sd_node) * SD_MAX_NODES;
 
+	if (unlikely((offsetof(struct zk_event, buf) + len) > ZK_MAX_BUF_SIZE))
+		panic("Zookeeper can't send message more than 1M");
 	ev.id = get_uniq_id();
 	ev.type = EVENT_JOIN;
 	ev.sender = this_node;
 	ev.msg_len = msg_len;
 	ev.buf_len = len;
 	if (msg)
-		memcpy(ev.buf, msg, msg_len);
+		memcpy(ev.buf, msg, msglen);
 	return zk_queue_push(&ev);
 }
 
diff --git a/sheep/group.c b/sheep/group.c
index 35a930c..1257fad 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -410,6 +410,18 @@ static bool enough_nodes_gathered(struct cluster_info *cinfo,
 	return true;
 }
 
+/*
+ * We have to use memcpy because some cluster drivers like corosync can't
+ * send the whole cluster_info structure.
+ */
+static void cluster_info_copy(struct cluster_info *dst,
+			      const struct cluster_info *src)
+{
+	int len = offsetof(struct cluster_info, nodes) +
+		src->nr_nodes * sizeof(struct sd_node);
+	memcpy(dst, src, len);
+}
+
 static enum sd_status cluster_wait_check(const struct sd_node *joining,
 					 const struct rb_root *nroot,
 					 size_t nr_nodes,
@@ -423,7 +435,7 @@ static enum sd_status cluster_wait_check(const struct sd_node *joining,
 	if (cinfo->epoch > sys->cinfo.epoch) {
 		sd_debug("joining node has a larger epoch, %" PRIu32 ", %"
 			 PRIu32, cinfo->epoch, sys->cinfo.epoch);
-		sys->cinfo = *cinfo;
+		cluster_info_copy(&sys->cinfo, cinfo);
 	}
 
 	/*
@@ -746,7 +758,7 @@ main_fn bool sd_join_handler(const struct sd_node *joining,
 	else
 		status = sys->cinfo.status;
 
-	*cinfo = sys->cinfo;
+	cluster_info_copy(cinfo, &sys->cinfo);
 	cinfo->status = status;
 	cinfo->proto_ver = SD_SHEEP_PROTO_VER;
 
@@ -760,9 +772,11 @@ main_fn bool sd_join_handler(const struct sd_node *joining,
 static int send_join_request(void)
 {
 	struct sd_node *n = &sys->this_node;
+	int len = offsetof(struct cluster_info, nodes) +
+		sys->cinfo.nr_nodes * sizeof(struct sd_node);
 
 	sd_info("%s", node_to_str(n));
-	return sys->cdrv->join(n, &sys->cinfo, sizeof(sys->cinfo));
+	return sys->cdrv->join(n, &sys->cinfo, len);
 }
 
 static void requeue_cluster_request(void)
@@ -879,7 +893,7 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
 		exit(1);
 	}
 
-	sys->cinfo = *cinfo;
+	cluster_info_copy(&sys->cinfo, cinfo);
 
 	sd_debug("join %s", node_to_str(joined));
 	rb_for_each_entry(n, nroot, rb) {
-- 
1.7.9.5