[sheepdog] [PATCH, RFC] sheep: send the vdi inuse bitmap in the join message

Wed May 23 20:12:14 CEST 2012

Currently the post-join event handler tries to contact one or more sheep
to get the current vdi inuse bitmap.  In addition to adding network round
trips, this has the problem that we need to properly block all vdi lookups
until this work has completed, which doesn't work properly, and that the
vdi bitmap is written both from the event worker thread and from the main
thread.

The simplest fix (so simple that it removes 90 lines of code) is to add the
vdi inuse bitmap to the join message, and simply use the bitmap from the join
message in any newly joining node.  The downside is that this increases the
size of the join message by a large amount, which requires increasing
the size of the event buffer.

So far I've only tested this with the local driver as my test cluster is
busy, but I'd love to get some feedback on this design.

Signed-off-by: Christoph Hellwig <hch at lst.de>

---
 sheep/cluster.h |    2 -
 sheep/group.c   |  103 +++-----------------------------------------------------
 2 files changed, 7 insertions(+), 98 deletions(-)

Index: sheepdog/sheep/group.c
===================================================================

--- sheepdog.orig/sheep/group.c	2012-05-23 19:51:21.516139460 +0200
+++ sheepdog/sheep/group.c	2012-05-23 19:57:18.328148597 +0200
@@ -50,6 +50,7 @@ struct join_message {
 	uint32_t result;
 	uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
 	uint8_t store[STORE_LEN];
+	DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
 	union {
 		struct sd_node nodes[0];
 		struct sd_node leave_nodes[0];
@@ -73,9 +74,6 @@ struct work_notify {
 
 struct work_join {
 	struct event_struct cev;
-
-	struct sd_node *member_list;
-	size_t member_list_entries;
 	struct sd_node joined;
 
 	struct join_message *jm;
@@ -502,53 +500,6 @@ out:
 	return ret;
 }
 
-static int get_vdi_bitmap_from(struct sd_node *node)
-{
-	struct sd_req hdr;
-	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-	static DECLARE_BITMAP(tmp_vdi_inuse, SD_NR_VDIS);
-	int fd, i, ret = SD_RES_SUCCESS;
-	unsigned int rlen, wlen;
-	char host[128];
-
-	if (is_myself(node->addr, node->port))
-		goto out;
-
-	addr_to_str(host, sizeof(host), node->addr, 0);
-
-	fd = connect_to(host, node->port);
-	if (fd < 0) {
-		vprintf(SDOG_ERR, "unable to get the VDI bitmap from %s: %m\n", host);
-		ret = -SD_RES_EIO;
-		goto out;
-	}
-
-	vprintf(SDOG_ERR, "%s:%d\n", host, node->port);
-
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.opcode = SD_OP_READ_VDIS;
-	hdr.epoch = sys->epoch;
-	hdr.data_length = sizeof(tmp_vdi_inuse);
-	rlen = hdr.data_length;
-	wlen = 0;
-
-	ret = exec_req(fd, &hdr, (char *)tmp_vdi_inuse,
-			&wlen, &rlen);
-
-	close(fd);
-
-	if (ret || rsp->result != SD_RES_SUCCESS) {
-		vprintf(SDOG_ERR, "unable to get the VDI bitmap (%d, %d)\n", ret,
-				rsp->result);
-		goto out;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(sys->vdi_inuse); i++)
-		sys->vdi_inuse[i] |= tmp_vdi_inuse[i];
-out:
-	return ret;
-}
-
 static void update_node_info(struct sd_node *nodes, size_t nr_nodes)
 {
 	print_node_list(nodes, nr_nodes);
@@ -664,10 +615,6 @@ static void update_cluster_info(struct j
 	}
 }
 
-static void __sd_notify(struct event_struct *cevent)
-{
-}
-
 static void __sd_notify_done(struct event_struct *cevent)
 {
 	struct work_notify *w = container_of(cevent, struct work_notify, cev);
@@ -765,35 +712,6 @@ static int check_majority(struct sd_node
 	return 0;
 }
 
-static void __sd_join(struct event_struct *cevent)
-{
-	struct work_join *w = container_of(cevent, struct work_join, cev);
-	struct join_message *msg = w->jm;
-	int i;
-
-	if (msg->cluster_status != SD_STATUS_OK &&
-	    msg->cluster_status != SD_STATUS_HALT)
-		return;
-
-	if (sys_stat_ok())
-		return;
-
-	for (i = 0; i < w->member_list_entries; i++) {
-		/* We should not fetch vdi_bitmap from myself */
-		if (node_eq(w->member_list + i, &sys->this_node))
-			continue;
-
-		get_vdi_bitmap_from(w->member_list + i);
-
-		/*
-		 * If a new comer try to join the running cluster, it only
-		 * need read one copy of bitmap from one of other members.
-		 */
-		if (sys_stat_wait_format())
-			break;
-	}
-}
-
 static void __sd_leave(struct event_struct *cevent)
 {
 	struct work_leave *w = container_of(cevent, struct work_leave, cev);
@@ -851,6 +769,7 @@ enum cluster_join_result sd_check_join_c
 	jm->cluster_flags = sys->flags;
 	jm->ctime = get_cluster_ctime();
 	jm->nr_leave_nodes = 0;
+	memcpy(&jm->vdi_inuse, &sys->vdi_inuse, sizeof(sys->vdi_inuse));
 
 	if (sd_store)
 		strcpy((char *)jm->store, sd_store->name);
@@ -916,6 +835,9 @@ static void __sd_join_done(struct event_
 
 	print_node_list(sys->nodes, sys->nr_nodes);
 
+	if (node_eq(&w->joined, &sys->this_node))
+		memcpy(&sys->vdi_inuse, &jm->vdi_inuse, sizeof(sys->vdi_inuse));
+
 	sys_stat_set(jm->cluster_status);
 
 	if (sys_can_recover() && jm->inc_epoch) {
@@ -951,7 +873,6 @@ static void event_free(struct event_stru
 	switch (cevent->ctype) {
 	case EVENT_JOIN: {
 		struct work_join *w = container_of(cevent, struct work_join, cev);
-		free(w->member_list);
 		free(w->jm);
 		free(w);
 		break;
@@ -986,14 +907,10 @@ static void event_fn(struct work *work)
 
 	switch (cevent->ctype) {
 	case EVENT_JOIN:
-		__sd_join(cevent);
+	case EVENT_NOTIFY:
 		break;
 	case EVENT_LEAVE:
 		__sd_leave(cevent);
-		break;
-	case EVENT_NOTIFY:
-		__sd_notify(cevent);
-		break;
 	default:
 		vprintf(SDOG_ERR, "unknown event %d\n", cevent->ctype);
 	}
@@ -1194,14 +1111,6 @@ void sd_join_handler(struct sd_node *joi
 
 		vprintf(SDOG_DEBUG, "allow new confchg %p\n", cevent);
 
-		size = sizeof(struct sd_node) * nr_members;
-		w->member_list = zalloc(size);
-		if (!w->member_list)
-			panic("failed to allocate memory");
-
-		memcpy(w->member_list, members, size);
-		w->member_list_entries = nr_members;
-
 		w->joined = *joined;
 
 		size = get_join_message_size(opaque);
Index: sheepdog/sheep/cluster.h
===================================================================
--- sheepdog.orig/sheep/cluster.h	2012-05-23 19:51:21.516139460 +0200
+++ sheepdog/sheep/cluster.h	2012-05-23 19:54:34.952144414 +0200
@@ -23,7 +23,7 @@
 #include "logger.h"
 
 /* maximum payload size sent in ->notify and ->unblock */
-#define SD_MAX_EVENT_BUF_SIZE (64 * 1024)
+#define SD_MAX_EVENT_BUF_SIZE (4096 * 1024)
 
 enum cluster_join_result {
 	CJ_RES_SUCCESS, /* Success */