[sheepdog] [PATCH] zookeeper: allow non-master nodes to call sd_join_handler

MORITA Kazutaka morita.kazutaka at gmail.com
Thu Jul 25 06:26:06 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

Master election is still necessary to handle concurrent joins when
sheepdog starts up.  I left this as a TODO in the source code.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---

Hi Kyle,

This patch was previously part of my patch "sheep: remove master node".  If
you have another approach to removing the master from the zk driver,
please ignore this one.

Thanks,

Kazutaka

 sheep/cluster/zookeeper.c |   75 ++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 6ad694b..959dd49 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -72,15 +72,11 @@ static struct sd_node sd_nodes[SD_MAX_NODES];
 static size_t nr_sd_nodes;
 static struct rb_root zk_node_root = RB_ROOT;
 static pthread_rwlock_t zk_tree_lock = PTHREAD_RWLOCK_INITIALIZER;
-static pthread_rwlock_t zk_compete_master_lock = PTHREAD_RWLOCK_INITIALIZER;
 static LIST_HEAD(zk_block_list);
-static uatomic_bool is_master;
-static uatomic_bool stop;
+static bool first_member;
 static bool joined;
 static bool first_push = true;
 
-static void zk_compete_master(void);
-
 static struct zk_node *zk_tree_insert(struct zk_node *new)
 {
 	struct rb_node **p = &zk_node_root.rb_node;
@@ -562,12 +558,6 @@ static void zk_watcher(zhandle_t *zh, int type, int state, const char *path,
 	} else if (type == ZOO_DELETED_EVENT) {
 		struct zk_node *n;
 
-		ret = sscanf(path, MASTER_ZNONE "/%s", str);
-		if (ret == 1) {
-			zk_compete_master();
-			return;
-		}
-
 		ret = sscanf(path, MEMBER_ZNODE "/%s", str);
 		if (ret != 1)
 			return;
@@ -719,27 +709,16 @@ static int zk_verify_last_sheep_join(int seq, int *last_sheep)
  * Create sequential node under MASTER_ZNODE.
  * Sheep with least sequential number win the competition.
  */
-static void zk_compete_master(void)
+static bool zk_compete_master(void)
 {
 	int rc, last_joined_sheep;
 	char master_name[MAX_NODE_STR_LEN];
 	char my_compete_path[MAX_NODE_STR_LEN];
 	static int master_seq = -1, my_seq;
 
-	/*
-	 * This is to protect master_seq and my_seq because this function will
-	 * be called by both main thread and zookeeper's event thread.
-	 */
-	pthread_rwlock_wrlock(&zk_compete_master_lock);
-
-	if (uatomic_is_true(&is_master) || uatomic_is_true(&stop))
-		goto out_unlock;
-
 	if (!joined) {
 		sd_dprintf("start to compete master for the first time");
 		do {
-			if (uatomic_is_true(&stop))
-				goto out_unlock;
 			/* duplicate sequential node has no side-effect */
 			rc = zk_create_seq_node(MASTER_ZNONE "/",
 						node_to_str(&this_node.node),
@@ -749,7 +728,7 @@ static void zk_compete_master(void)
 		} while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
 		CHECK_ZK_RC(rc, MASTER_ZNONE "/");
 		if (rc != ZOK)
-			goto out_unlock;
+			return false;
 
 		sd_dprintf("my compete path: %s", my_compete_path);
 		sscanf(my_compete_path, MASTER_ZNONE "/%"PRId32,
@@ -757,17 +736,17 @@ static void zk_compete_master(void)
 	}
 
 	if (zk_find_master(&master_seq, master_name) != ZOK)
-		goto out_unlock;
+		return false;
 
 	if (!strcmp(master_name, node_to_str(&this_node.node)))
 		goto success;
 	else if (joined) {
 		sd_dprintf("lost");
-		goto out_unlock;
+		return false;
 	} else {
 		if (zk_verify_last_sheep_join(my_seq,
 					      &last_joined_sheep) != ZOK)
-			goto out_unlock;
+			return false;
 
 		if (last_joined_sheep < 0) {
 			/* all previous sheep has quit, i'm master */
@@ -775,14 +754,20 @@ static void zk_compete_master(void)
 			goto success;
 		} else {
 			sd_dprintf("lost");
-			goto out_unlock;
+			return false;
 		}
 	}
 success:
-	uatomic_set_true(&is_master);
 	sd_dprintf("success");
-out_unlock:
-	pthread_rwlock_unlock(&zk_compete_master_lock);
+	return true;
+}
+
+static int zk_member_empty(void)
+{
+	struct String_vector strs;
+
+	zk_get_children(MEMBER_ZNODE, &strs);
+	return (strs.count == 0);
 }
 
 static int zk_join(const struct sd_node *myself,
@@ -800,7 +785,9 @@ static int zk_join(const struct sd_node *myself,
 		exit(1);
 	}
 
-	zk_compete_master();
+	if (zk_member_empty() && zk_compete_master())
+		first_member = true;
+
 	RETURN_IF_ERROR(add_join_event(opaque, opaque_len), "");
 
 	return ZOK;
@@ -811,7 +798,6 @@ static int zk_leave(void)
 	char path[PATH_MAX];
 
 	sd_iprintf("leaving from cluster");
-	uatomic_set_true(&stop);
 
 	snprintf(path, sizeof(path), MEMBER_ZNODE"/%s",
 		 node_to_str(&this_node.node));
@@ -838,16 +824,24 @@ static int zk_unblock(void *msg, size_t msg_len)
 static void zk_handle_join(struct zk_event *ev)
 {
 	sd_dprintf("sender: %s", node_to_str(&ev->sender.node));
-	if (!uatomic_is_true(&is_master)) {
-		/* Let's await master acking the join-request */
+	if (!first_member && node_eq(&ev->sender.node, &this_node.node)) {
+		/*
+		 * This node doesn't have sd_nodes yet.  Let's await another
+		 * acking the join-request.
+		 *
+		 * TODO: If the first member can have a valid sd_node, the node
+		 * can call sd_join_handler and we can remove a master from this
+		 * driver.
+		 */
 		queue_pos--;
 		return;
 	}
 
-	sd_join_handler(&ev->sender.node, sd_nodes, nr_sd_nodes, ev->buf);
-	push_join_response(ev);
-
-	sd_dprintf("I'm the master now");
+	if (sd_join_handler(&ev->sender.node, sd_nodes, nr_sd_nodes, ev->buf))
+		push_join_response(ev);
+	else
+		/* Let's await another acking the join-request */
+		queue_pos--;
 }
 
 static void watch_all_nodes(void)
@@ -1060,6 +1054,7 @@ static void zk_event_handler(int listen_fd, int events, void *data)
 		sd_eprintf("detect a session timeout. reconnecting...");
 		handle_session_expire();
 		sd_iprintf("reconnected");
+		first_member = false;
 		eventfd_write(efd, 1);
 		return;
 	}
@@ -1120,8 +1115,6 @@ static int zk_init(const char *option)
 		return -1;
 	}
 
-	uatomic_set_false(&stop);
-	uatomic_set_false(&is_master);
 	if (zk_queue_init() != ZOK)
 		return -1;
 
-- 
1.7.9.5




More information about the sheepdog mailing list