[sheepdog] [PATCH 1/4] zookeeper: fixed concurrent startup error

Kai Zhang kyle at zelin.io
Fri Jun 14 03:46:35 CEST 2013


The current implementation of the zookeeper driver has a risk when multiple
sheep start up concurrently.

Consider the following situation:
1. There is a 3 node cluster: sheep1, sheep2, sheep3.
2. Both sheep1 and sheep2 leave the cluster.
3. Both sheep1 and sheep2 start up after their previous zookeeper sessions
   have timed out.
4. Sheep3 leaves the cluster before sheep1 and sheep2 receive their join
   requests from zookeeper.
5. When sheep1 and sheep2 receive the join requests, both of them assume they
   are the master because zk_member_empty() returns true.

The new implementation avoids this problem because a sheep assumes it is the
master only if it successfully creates the master node.

Signed-off-by: Kai Zhang <kyle at zelin.io>
---
 sheep/cluster/zookeeper.c |   89 ++++++++++++++++++++++++++++++---------------
 1 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 45db10a..09b14c8 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -33,7 +33,7 @@
 
 /* iterate child znodes */
 #define FOR_EACH_ZNODE(parent, path, strs)			       \
-	for (zk_get_children(parent, strs),		               \
+	for (zk_get_children(parent, strs),			       \
 		     (strs)->data += (strs)->count;		       \
 	     (strs)->count-- ?					       \
 		     snprintf(path, sizeof(path), "%s/%s", parent,     \
@@ -73,6 +73,10 @@ static size_t nr_sd_nodes;
 static struct rb_root zk_node_root = RB_ROOT;
 static pthread_rwlock_t zk_tree_lock = PTHREAD_RWLOCK_INITIALIZER;
 static LIST_HEAD(zk_block_list);
+static uatomic_bool is_master;
+static uatomic_bool stop;
+
+static void zk_compete_master(void);
 
 static struct zk_node *zk_tree_insert(struct zk_node *new)
 {
@@ -436,29 +440,6 @@ static inline int zk_master_create(void)
 			      ZOO_EPHEMERAL, NULL, 0);
 }
 
-static bool is_master(void)
-{
-	struct rb_node *n;
-	struct zk_node *zk = NULL;
-
-	if (!nr_sd_nodes) {
-		if (zk_member_empty())
-			return true;
-		else
-			return false;
-	}
-
-	for (n = rb_first(&zk_node_root); n; n = rb_next(n)) {
-		zk = rb_entry(n, struct zk_node, rb);
-		if (!zk->gone)
-			break;
-	}
-	if (zk && node_eq(&zk->node, &this_node.node))
-		return true;
-
-	return false;
-}
-
 static void zk_queue_init(void)
 {
 	zk_init_node(BASE_ZNODE);
@@ -511,6 +492,13 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
 	} else if (type == ZOO_DELETED_EVENT) {
 		struct zk_node *n;
 
+		ret = strcmp(path, MASTER_ZNONE);
+		if (ret == 0) {
+			sd_dprintf("detect master leave");
+			zk_compete_master();
+			return;
+		}
+
 		ret = sscanf(path, MEMBER_ZNODE "/%s", str);
 		if (ret != 1)
 			return;
@@ -550,6 +538,41 @@ static int add_join_event(void *msg, size_t msg_len)
 	return 0;
 }
 
+static void zk_compete_master(void)
+{
+	int rc;
+
+	if (uatomic_is_true(&is_master))
+		return;
+
+	sd_iprintf("start to compete master");
+again:
+	if (uatomic_is_true(&stop))
+		return;
+
+	rc = zk_node_exists(MASTER_ZNONE);
+	if (rc == ZOK)
+		goto out;
+	else if (rc == ZNONODE)
+		rc = zk_create_node(MASTER_ZNONE, node_to_str(&this_node.node),
+				    MAX_NODE_STR_LEN,
+				    &ZOO_OPEN_ACL_UNSAFE,
+				    ZOO_EPHEMERAL, NULL, 0);
+
+	if (rc == ZOK) {
+		uatomic_set_true(&is_master);
+		sd_iprintf("compete master successfully");
+		return;
+	} else if (rc == ZNODEEXISTS)
+		goto out;
+	else {
+		sleep(1);
+		goto again;
+	}
+out:
+	sd_iprintf("lost competition of master");
+}
+
 static int zk_join(const struct sd_node *myself,
 		   void *opaque, size_t opaque_len)
 {
@@ -565,16 +588,20 @@ static int zk_join(const struct sd_node *myself,
 		exit(1);
 	}
 
-	/* For concurrent nodes setup, we allow only one to continue */
-	while (zk_member_empty() && zk_master_create() != ZOK)
-		;/* wait */
-
+	zk_compete_master();
 	return add_join_event(opaque, opaque_len);
 }
 
 static int zk_leave(void)
 {
 	char path[PATH_MAX];
+
+	sd_iprintf("leaving from cluster");
+	uatomic_set_true(&stop);
+
+	if (uatomic_is_true(&is_master))
+		zk_delete_node(MASTER_ZNONE, -1);
+
 	snprintf(path, sizeof(path), MEMBER_ZNODE"/%s",
 			node_to_str(&this_node.node));
 	add_event(EVENT_LEAVE, &this_node, NULL, 0);
@@ -602,7 +629,7 @@ static void zk_handle_join_request(struct zk_event *ev)
 	enum cluster_join_result res;
 
 	sd_dprintf("sender: %s", node_to_str(&ev->sender.node));
-	if (!is_master()) {
+	if (!uatomic_is_true(&is_master)) {
 		/* Let's await master acking the join-request */
 		queue_pos--;
 		return;
@@ -614,7 +641,7 @@ static void zk_handle_join_request(struct zk_event *ev)
 	if (res == CJ_RES_MASTER_TRANSFER) {
 		sd_eprintf("failed to join sheepdog cluster: "
 			   "please retry when master is up");
-		add_event(EVENT_LEAVE, &this_node, NULL, 0);
+		zk_leave();
 		exit(1);
 	}
 	sd_dprintf("I'm the master now");
@@ -850,6 +877,8 @@ static int zk_init(const char *option)
 		return -1;
 	}
 
+	uatomic_set_false(&stop);
+	uatomic_set_false(&is_master);
 	zk_queue_init();
 
 	efd = eventfd(0, EFD_NONBLOCK);
-- 
1.7.1




More information about the sheepdog mailing list