[Sheepdog] [PATCH, RFC] sheep: factor node list updates

Christoph Hellwig hch at infradead.org
Fri May 11 16:30:42 CEST 2012


Add a new helper that updates sys->nodes and sys->nr_nodes as well as the
vnode list, and use it in all places that update cluster membership.

In __sd_leave_done it can be used as is and gives a nice cleanup.  In the
master transfer case in sd_join_handler it can also be used as is, but I've
added an assert for the previously implicit assumption that no other nodes
can exist.  The tricky case is update_cluster_info/finish_join, and I'm
a bit uneasy about that one as I don't fully understand what the current
version of that code does: it goes to great lengths to skip the joining
node when it first updates the node list, then writes the old list to the
epoch log and only updates it later with the joining node.  Looking at
git history I can't find a good reason for this behaviour that basically
writes out a stale version of the epoch log, but if there is a good reason
it should be documented in the code, and we should add a version of
update_epoch_log that allows writing it to disk without the need of
writing it into sys->nodes first.

Signed-off-by: Christoph Hellwig <hch at lst.de>

diff --git a/sheep/group.c b/sheep/group.c
index f4ba663..cbb1871 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -521,6 +521,17 @@ static void get_vdi_bitmap_from_sd_list(void)
 		get_vdi_bitmap_from(sys->nodes + i);
 }
 
+static void update_node_info(struct sd_node *nodes, size_t nr_nodes)
+{
+	print_node_list(nodes, nr_nodes);
+
+	sys->nr_nodes = nr_nodes;
+	memcpy(sys->nodes, nodes, sizeof(*sys->nodes) * sys->nr_nodes);
+	qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
+
+	update_vnode_info();
+}
+
 static void finish_join(struct join_message *msg, struct sd_node *joined,
 		struct sd_node *nodes, size_t nr_nodes)
 {
@@ -529,15 +540,6 @@ static void finish_join(struct join_message *msg, struct sd_node *joined,
 	sys->nr_copies = msg->nr_copies;
 	sys->epoch = msg->epoch;
 
-	/* add nodes execept for newly joined one */
-	for (i = 0; i < nr_nodes; i++) {
-		if (node_cmp(nodes + i, joined) == 0)
-			continue;
-
-		sys->nodes[sys->nr_nodes++] = nodes[i];
-	}
-	qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
-
 	if (msg->cluster_status != SD_STATUS_OK) {
 		int nr_leave_nodes;
 		uint32_t le;
@@ -559,13 +561,9 @@ static void finish_join(struct join_message *msg, struct sd_node *joined,
 			list_add_tail(&n->list, &sys->leave_list);
 		}
 	}
-
+	
 	sys->join_finished = 1;
 
-	if ((msg->cluster_status == SD_STATUS_OK ||
-	     msg->cluster_status == SD_STATUS_HALT) && msg->inc_epoch)
-		update_epoch_log(sys->epoch);
-
 	if (!sd_store && strlen((char *)msg->store)) {
 		sd_store = find_store_driver((char *)msg->store);
 		if (sd_store) {
@@ -589,8 +587,7 @@ static void update_cluster_info(struct join_message *msg,
 	if (!sys->join_finished)
 		finish_join(msg, joined, nodes, nr_nodes);
 
-	sys->nodes[sys->nr_nodes++] = *joined;
-	qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
+	update_node_info(nodes, nr_nodes);
 
 	if (msg->cluster_status == SD_STATUS_OK ||
 	    msg->cluster_status == SD_STATUS_HALT) {
@@ -606,10 +603,7 @@ static void update_cluster_info(struct join_message *msg,
 			set_cluster_ctime(msg->ctime);
 		}
 	}
-	update_vnode_info();
 	sys_stat_set(msg->cluster_status);
-
-	print_node_list(sys->nodes, sys->nr_nodes);
 }
 
 static void __sd_notify(struct event_struct *cevent)
@@ -854,21 +848,15 @@ static void __sd_leave_done(struct event_struct *cevent)
 {
 	struct work_leave *w = container_of(cevent, struct work_leave, cev);
 
-	sys->nr_nodes = w->member_list_entries;
-	memcpy(sys->nodes, w->member_list, sizeof(*sys->nodes) * sys->nr_nodes);
-	qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
+	update_node_info(w->member_list, w->member_list_entries);
 
 	if (sys_can_recover()) {
 		sys->epoch++;
 		update_epoch_store(sys->epoch);
 		update_epoch_log(sys->epoch);
-	}
-	update_vnode_info();
-
-	print_node_list(sys->nodes, sys->nr_nodes);
 
-	if (sys_can_recover())
 		start_recovery(sys->epoch);
+	}
 
 	if (sys_can_halt()) {
 		if (current_vnode_info->nr_zones < sys->nr_copies)
@@ -1196,12 +1184,8 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
 		 * Now mastership transfer is done.
 		 */
 		if (!sys->join_finished) {
-			sys->join_finished = 1;
-			sys->nodes[sys->nr_nodes++] = sys->this_node;
-			qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp);
-			sys->epoch = get_latest_epoch();
-
-			update_vnode_info();
+			assert(sys->nr_nodes == 0);
+			update_node_info(&sys->this_node, 1);
 		}
 
 		nr_local = get_nodes_nr_epoch(sys->epoch);



More information about the sheepdog mailing list