[sheepdog] [Sheepdog] [PATCH 0/4] fix a race when multiple sheep join a cluster very quickly

Christoph Hellwig hch at infradead.org
Wed May 16 19:20:23 CEST 2012


On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote:
> On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote:
> > Hi Hellwig, Do you have any comments on my latest unregister patch
> > which have been updated to reflect your previous comments?
> 
> I like it, but I ran into an issue where a VDI created on one
> nodes doesn't seem to to be found on another node created later.

The issues was that sys_stat_ok is now always true in __sd_join.  The
patch below records the state before we joined and stores it in the
join work to restore the old behavior.  It also filters the joining node
out to match the old behaviour, just as your earlier versions of the
patches did.  The patch passes all my testing, but it still introduces
the problem with I/O concurrent to epoch/node list changes vs recovery
that Kazutaka mentioned in his recent mail, so it will need more work.

Also note that it is based on top of my three patches as my working tree
has them included.


diff --git a/sheep/group.c b/sheep/group.c
index 6921f5e..8c684f7 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -9,6 +9,7 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 #include <assert.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -78,6 +79,7 @@ struct work_join {
 	struct sd_node *member_list;
 	size_t member_list_entries;
 	struct sd_node joined;
+	bool get_vdi_bitmap;
 
 	struct join_message *jm;
 };
@@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes)
 static void __sd_join(struct event_struct *cevent)
 {
 	struct work_join *w = container_of(cevent, struct work_join, cev);
-	struct join_message *msg = w->jm;
 	int i;
 
-	if (msg->cluster_status != SD_STATUS_OK &&
-	    msg->cluster_status != SD_STATUS_HALT)
-		return;
-
-	if (sys_stat_ok())
+	if (!w->get_vdi_bitmap)
 		return;
 
 	for (i = 0; i < w->member_list_entries; i++) {
-
-		get_vdi_bitmap_from(w->member_list + i);
+		if (node_eq(&w->member_list[i], &w->joined))
+			continue;
+		get_vdi_bitmap_from(&w->member_list[i]);
 
 		/*
 		 * If a new comer try to join the running cluster, it only
@@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent)
 
 	print_node_list(sys->nodes, sys->nr_nodes);
 
-	if (!sys_stat_join_failed()) {
-		update_cluster_info(jm, &w->joined, w->member_list,
-				    w->member_list_entries);
-	}
-
 	if (sys_can_recover() && jm->inc_epoch) {
 		list_for_each_entry_safe(node, t, &sys->leave_list, list) {
 			list_del(&node->list);
@@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent)
 
 static void __sd_leave_done(struct event_struct *cevent)
 {
-	struct work_leave *w = container_of(cevent, struct work_leave, cev);
-
-	update_node_info(w->member_list, w->member_list_entries);
-
-	if (sys_can_recover()) {
-		sys->epoch++;
-		update_epoch_store(sys->epoch);
-		update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
-
+	if (sys_can_recover())
 		start_recovery(sys->epoch);
-	}
 
 	if (sys_can_halt()) {
 		if (current_vnode_info->nr_zones < sys->nr_copies)
@@ -965,7 +949,6 @@ static void event_fn(struct work *work)
 static void event_done(struct work *work)
 {
 	struct event_struct *cevent;
-	int ret;
 
 	if (!sys->cur_cevent)
 		vprintf(SDOG_ERR, "bug\n");
@@ -992,9 +975,6 @@ static void event_done(struct work *work)
 	vprintf(SDOG_DEBUG, "free %p\n", cevent);
 	event_free(cevent);
 	event_running = 0;
-	ret = register_event(cdrv_fd, group_handler, NULL);
-	if (ret)
-		panic("failed to register event fd");
 
 	process_request_event_queues();
 }
@@ -1106,7 +1086,6 @@ static inline void process_event_queue(void)
 	event_work.fn = event_fn;
 	event_work.done = event_done;
 
-	unregister_event(cdrv_fd);
 	queue_work(sys->event_wqueue, &event_work);
 }
 
@@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
 	int i, size;
 	int nr, nr_local, nr_leave;
 	struct node *n;
-	struct join_message *jm;
+	struct join_message *jm = opaque;
 	uint32_t le = get_latest_epoch();
 
 	if (node_eq(joined, &sys->this_node)) {
@@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
 
 		w->joined = *joined;
 
+		/*
+		 * Only get the VDI list if we were waiting for a format or
+		 * join before, and the join was successful.
+		 */
+		if (sys_stat_ok() ||
+		    (jm->cluster_status != SD_STATUS_OK &&
+		     jm->cluster_status != SD_STATUS_HALT))
+			w->get_vdi_bitmap = false;
+		else
+			w->get_vdi_bitmap = true;
+
 		size = get_join_message_size(opaque);
 		w->jm = zalloc(size);
 		if (!w->jm)
 			panic("failed to allocate memory\n");
 		memcpy(w->jm, opaque, size);
+		
+		update_cluster_info(jm, joined, members, nr_members);
 
 		list_add_tail(&cevent->event_list, &sys->event_queue);
 		process_request_event_queues();
@@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
 		}
 		break;
 	case CJ_RES_MASTER_TRANSFER:
-		jm = (struct join_message *)opaque;
 		nr = jm->nr_leave_nodes;
 		for (i = 0; i < nr; i++) {
 			if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list)
@@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members,
 	if (sys_stat_shutdown())
 		return;
 
+	update_node_info(w->member_list, w->member_list_entries);
+
+	if (sys_can_recover()) {
+		sys->epoch++;
+		update_epoch_store(sys->epoch);
+		update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
+	}
+
 	w = zalloc(sizeof(*w));
 	if (!w)
 		goto oom;



More information about the sheepdog mailing list