[sheepdog] [PATCH v3 1/5] zookeeper: fixed concurrent startup error

Kai Zhang kyle at zelin.io
Mon Jun 17 14:28:42 CEST 2013


The current implementation of the zookeeper driver has a race when multiple
sheep start up concurrently.

Consider the following situation:
1. There is a 3 node cluster: sheep1, sheep2, sheep3.
2. Both sheep1 and sheep2 leave cluster.
3. Both sheep1 and sheep2 start up after previous zookeeper session timeout.
4. Sheep3 leaves the cluster before sheep1 and sheep2 receive their join
   requests from zookeeper.
5. When sheep1 and sheep2 receive the join requests, both of them assume they
   are master because zk_member_empty() returns true.

The new implementation avoids this problem because a sheep assumes it is
master only if it successfully creates the master node.

Signed-off-by: Kai Zhang <kyle at zelin.io>
---
 sheep/cluster/zookeeper.c |  101 +++++++++++++++++++++++++++++++--------------
 1 file changed, 71 insertions(+), 30 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 45db10a..045034a 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -33,7 +33,7 @@
 
 /* iterate child znodes */
 #define FOR_EACH_ZNODE(parent, path, strs)			       \
-	for (zk_get_children(parent, strs),		               \
+	for (zk_get_children(parent, strs),			       \
 		     (strs)->data += (strs)->count;		       \
 	     (strs)->count-- ?					       \
 		     snprintf(path, sizeof(path), "%s/%s", parent,     \
@@ -73,6 +73,10 @@ static size_t nr_sd_nodes;
 static struct rb_root zk_node_root = RB_ROOT;
 static pthread_rwlock_t zk_tree_lock = PTHREAD_RWLOCK_INITIALIZER;
 static LIST_HEAD(zk_block_list);
+static uatomic_bool is_master;
+static uatomic_bool stop;
+
+static void zk_compete_master(void);
 
 static struct zk_node *zk_tree_insert(struct zk_node *new)
 {
@@ -436,29 +440,6 @@ static inline int zk_master_create(void)
 			      ZOO_EPHEMERAL, NULL, 0);
 }
 
-static bool is_master(void)
-{
-	struct rb_node *n;
-	struct zk_node *zk = NULL;
-
-	if (!nr_sd_nodes) {
-		if (zk_member_empty())
-			return true;
-		else
-			return false;
-	}
-
-	for (n = rb_first(&zk_node_root); n; n = rb_next(n)) {
-		zk = rb_entry(n, struct zk_node, rb);
-		if (!zk->gone)
-			break;
-	}
-	if (zk && node_eq(&zk->node, &this_node.node))
-		return true;
-
-	return false;
-}
-
 static void zk_queue_init(void)
 {
 	zk_init_node(BASE_ZNODE);
@@ -511,6 +492,13 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
 	} else if (type == ZOO_DELETED_EVENT) {
 		struct zk_node *n;
 
+		ret = strcmp(path, MASTER_ZNONE);
+		if (ret == 0) {
+			sd_dprintf("detect master leave");
+			zk_compete_master();
+			return;
+		}
+
 		ret = sscanf(path, MEMBER_ZNODE "/%s", str);
 		if (ret != 1)
 			return;
@@ -550,6 +538,53 @@ static int add_join_event(void *msg, size_t msg_len)
 	return 0;
 }
 
+/* Become master iff we create the ephemeral master znode; retry on races. */
+static void zk_compete_master(void)
+{
+	int rc, len;
+	char master_name[MAX_NODE_STR_LEN], master_path[MAX_NODE_STR_LEN];
+
+	if (uatomic_is_true(&is_master))
+		return;
+
+	sd_iprintf("start to compete master");
+again:
+	if (uatomic_is_true(&stop))
+		return;
+
+	/* check if master exists; &len may be updated, so reset per try */
+	len = MAX_NODE_STR_LEN;
+	rc = zk_get_data(MASTER_ZNONE, master_name, &len);
+	if (rc == ZOK) {
+		/* make sure master has joined */
+		snprintf(master_path, sizeof(master_path), MEMBER_ZNODE "/%s",
+			 master_name);
+		rc = zk_node_exists(master_path);
+		if (rc == ZOK)
+			goto out;
+		else if (rc == ZNONODE)
+			goto again;
+		else
+			panic("failed, path:%s, %s", master_path, zerror(rc));
+	} else if (rc == ZNONODE) {
+		/* compete for master */
+		rc = zk_create_node(MASTER_ZNONE, node_to_str(&this_node.node),
+				    MAX_NODE_STR_LEN, &ZOO_OPEN_ACL_UNSAFE,
+				    ZOO_EPHEMERAL, NULL, 0);
+		if (rc == ZOK) {
+			uatomic_set_true(&is_master);
+			sd_iprintf("success");
+			return;
+		} else if (rc == ZNODEEXISTS)
+			goto again;
+		else
+			panic("failed, path:%s, %s", MASTER_ZNONE, zerror(rc));
+	} else
+		panic("failed, path:%s, %s", MASTER_ZNONE, zerror(rc));
+out:
+	sd_iprintf("lost");
+}
+
 static int zk_join(const struct sd_node *myself,
 		   void *opaque, size_t opaque_len)
 {
@@ -565,16 +600,20 @@ static int zk_join(const struct sd_node *myself,
 		exit(1);
 	}
 
-	/* For concurrent nodes setup, we allow only one to continue */
-	while (zk_member_empty() && zk_master_create() != ZOK)
-		;/* wait */
-
+	zk_compete_master();
 	return add_join_event(opaque, opaque_len);
 }
 
 static int zk_leave(void)
 {
 	char path[PATH_MAX];
+
+	sd_iprintf("leaving from cluster");
+	uatomic_set_true(&stop);
+
+	if (uatomic_is_true(&is_master))
+		zk_delete_node(MASTER_ZNONE, -1);
+
 	snprintf(path, sizeof(path), MEMBER_ZNODE"/%s",
 			node_to_str(&this_node.node));
 	add_event(EVENT_LEAVE, &this_node, NULL, 0);
@@ -602,7 +641,7 @@ static void zk_handle_join_request(struct zk_event *ev)
 	enum cluster_join_result res;
 
 	sd_dprintf("sender: %s", node_to_str(&ev->sender.node));
-	if (!is_master()) {
+	if (!uatomic_is_true(&is_master)) {
 		/* Let's await master acking the join-request */
 		queue_pos--;
 		return;
@@ -614,7 +653,7 @@ static void zk_handle_join_request(struct zk_event *ev)
 	if (res == CJ_RES_MASTER_TRANSFER) {
 		sd_eprintf("failed to join sheepdog cluster: "
 			   "please retry when master is up");
-		add_event(EVENT_LEAVE, &this_node, NULL, 0);
+		zk_leave();
 		exit(1);
 	}
 	sd_dprintf("I'm the master now");
@@ -850,6 +889,8 @@ static int zk_init(const char *option)
 		return -1;
 	}
 
+	uatomic_set_false(&stop);
+	uatomic_set_false(&is_master);
 	zk_queue_init();
 
 	efd = eventfd(0, EFD_NONBLOCK);
-- 
1.7.9.5




More information about the sheepdog mailing list