[sheepdog] [PATCH 3/3] sheep: start cluster if all the nodes in the previous epoch are gathered
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Thu Jul 4 09:18:46 CEST 2013
At Thu, 4 Jul 2013 14:42:44 +0800,
Liu Yuan wrote:
>
> On Wed, Jul 03, 2013 at 03:49:23PM +0900, MORITA Kazutaka wrote:
> > From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> >
> > The current rules to start sheepdog automatically are too complex:
> >
> > - One of the nodes in the latest epoch becomes the master. If a
> > joining node has a larger epoch, that node becomes the new master
> > and the current master must exit and rejoin later.
> >
> > - If the joining node has a smaller epoch than the existing nodes,
> > it is added to the delayed node list and rejoins automatically
> > after sheepdog starts.
> >
> > - Sheepdog starts if the current node list becomes the same as the
> > one in the previous epoch.
> >
> > After this patch, we have only one rule:
> >
> > - Sheepdog starts if all the nodes in the previous epoch are
> > gathered.
> >
> > Nodes with a smaller or larger epoch can also join, and all the
> > joined nodes share the same (latest) cluster_info. Since every node
> > then has the latest epoch, any node can be the master. Once all the
> > nodes in the previous epoch have joined, the cluster starts
> > automatically.
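
To restate the new rule as code, here is a minimal standalone sketch
(hypothetical types and helper names, not the patch code itself; the
real check is enough_nodes_gathered() in sheep/group.c below):

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* hypothetical stand-in for struct sd_node */
struct node_id {
	char addr[16];
	int port;
};

static bool node_equal(const struct node_id *a, const struct node_id *b)
{
	return a->port == b->port && strcmp(a->addr, b->addr) == 0;
}

/*
 * The cluster may start iff every node recorded in the previous epoch
 * is found among the currently joined members or is the node that is
 * joining right now.
 */
static bool can_start_cluster(const struct node_id *prev_epoch, size_t nr_prev,
			      const struct node_id *members, size_t nr_members,
			      const struct node_id *joining)
{
	for (size_t i = 0; i < nr_prev; i++) {
		bool found = node_equal(&prev_epoch[i], joining);

		for (size_t j = 0; !found && j < nr_members; j++)
			found = node_equal(&prev_epoch[i], &members[j]);

		if (!found)
			return false;	/* an old member is still missing */
	}
	return true;
}
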
> >
> > This change also brings the following benefits:
> >
> > - We can remove the failed node list, the delayed node list, and the
> > CJ_RES_JOIN_LATER status, since nodes with a smaller epoch can now
> > join Sheepdog directly.
> >
> > - The CJ_RES_MASTER_TRANSFER status can be removed because any node
> > can be the master, so there is no need to transfer mastership even
> > if a node with a larger epoch joins Sheepdog.
> >
> > Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > ---
> > include/internal_proto.h | 16 +--
> > include/shepherd.h | 5 +-
> > sheep/cluster.h | 2 +
> > sheep/cluster/corosync.c | 16 +--
> > sheep/cluster/local.c | 21 +--
> > sheep/cluster/shepherd.c | 32 ++---
> > sheep/cluster/zookeeper.c | 20 +--
> > sheep/group.c | 313 ++++++++-------------------------------------
> > shepherd/shepherd.c | 8 +-
> > tests/functional/001 | 13 --
> > tests/functional/001.out | 19 +--
> > tests/functional/002 | 14 --
> > tests/functional/002.out | 18 +--
> > tests/functional/003 | 14 --
> > tests/functional/003.out | 18 +--
> > tests/functional/004 | 16 ---
> > tests/functional/004.out | 39 +-----
> > tests/functional/005 | 17 ---
> > tests/functional/005.out | 45 +------
> > tests/functional/060 | 17 +--
> > 20 files changed, 105 insertions(+), 558 deletions(-)
> >
> > diff --git a/include/internal_proto.h b/include/internal_proto.h
> > index f0c55c0..c5cd76d 100644
> > --- a/include/internal_proto.h
> > +++ b/include/internal_proto.h
> > @@ -155,9 +155,7 @@ struct epoch_log {
> >
> > struct join_message {
> > uint8_t proto_ver;
> > - uint8_t __pad1[3];
> > - uint16_t nr_failed_nodes;
> > - uint16_t nr_delayed_nodes;
> > + uint8_t __pad1[7];
> > uint32_t cluster_status;
> > uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
> > uint8_t __pad2[3];
> > @@ -195,18 +193,6 @@ enum cluster_join_result {
> >
> > /* Fail to join. The joining node has an invalid epoch. */
> > CJ_RES_FAIL,
> > -
> > - /*
> > - * Fail to join. The joining node should be added after the cluster
> > - * start working.
> > - */
> > - CJ_RES_JOIN_LATER,
> > -
> > - /*
> > - * Transfer mastership. The joining node has a newer epoch, so this
> > - * node will leave the cluster (restart later).
> > - */
> > - CJ_RES_MASTER_TRANSFER,
> > };
> >
> > static inline __attribute__((used)) void __sd_epoch_format_build_bug_ons(void)
> > diff --git a/include/shepherd.h b/include/shepherd.h
> > index 5dbc64b..e1fbac1 100644
> > --- a/include/shepherd.h
> > +++ b/include/shepherd.h
> > @@ -39,8 +39,11 @@ struct sph_msg {
> >
> > struct sph_msg_join {
> > uint32_t res; /* original type: enum cluster_join_result */
> > - struct sd_node node;
> > + struct sd_node new_node;
> > uint8_t master_elected;
> > +
> > + struct sd_node nodes[SD_MAX_NODES];
> > + uint32_t nr_nodes;
> > uint8_t opaque[0];
> > };
> >
> > diff --git a/sheep/cluster.h b/sheep/cluster.h
> > index 80a701b..7665d75 100644
> > --- a/sheep/cluster.h
> > +++ b/sheep/cluster.h
> > @@ -158,6 +158,8 @@ void sd_notify_handler(const struct sd_node *sender, void *msg, size_t msg_len);
> > bool sd_block_handler(const struct sd_node *sender);
> > int sd_reconnect_handler(void);
> > enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
> > + const struct sd_node *members,
> > + size_t nr_members,
> > void *opaque);
> > void recalculate_vnodes(struct sd_node *nodes, int nr_nodes);
> >
> > diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> > index 2be978b..bf90209 100644
> > --- a/sheep/cluster/corosync.c
> > +++ b/sheep/cluster/corosync.c
> > @@ -295,28 +295,18 @@ static bool __corosync_dispatch_one(struct corosync_event *cevent)
> > /* check_join() must be called only once */
> > return false;
> >
> > - res = sd_check_join_cb(&cevent->sender.ent,
> > - cevent->msg);
> > - if (res == CJ_RES_MASTER_TRANSFER)
> > - nr_cpg_nodes = 0;
> > -
> > + build_node_list(cpg_nodes, nr_cpg_nodes, entries);
> > + res = sd_check_join_cb(&cevent->sender.ent, entries,
> > + nr_cpg_nodes, cevent->msg);
> > send_message(COROSYNC_MSG_TYPE_JOIN_RESPONSE, res,
> > &cevent->sender, cpg_nodes, nr_cpg_nodes,
> > cevent->msg, cevent->msg_len);
> >
> > - if (res == CJ_RES_MASTER_TRANSFER) {
> > - sd_eprintf("failed to join sheepdog cluster:"
> > - " please retry when master is up");
> > - exit(1);
> > - }
> > -
> > cevent->callbacked = true;
> > return false;
> > case COROSYNC_EVENT_TYPE_JOIN_RESPONSE:
> > switch (cevent->result) {
> > case CJ_RES_SUCCESS:
> > - case CJ_RES_MASTER_TRANSFER:
> > - case CJ_RES_JOIN_LATER:
> > add_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender);
> > nr_cpg_nodes++;
> > /* fall through */
> > diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> > index 491e3ea..572aa19 100644
> > --- a/sheep/cluster/local.c
> > +++ b/sheep/cluster/local.c
> > @@ -441,31 +441,18 @@ static bool local_process_event(void)
> >
> > switch (ev->type) {
> > case EVENT_JOIN_REQUEST:
> > - res = sd_check_join_cb(&ev->sender.node, ev->buf);
> > + /* nodes[nr_nodes - 1] is a sender, so don't include it */
> > + assert(node_eq(&ev->sender.node, &nodes[nr_nodes - 1]));
> > + res = sd_check_join_cb(&ev->sender.node, nodes, nr_nodes - 1,
> > + ev->buf);
> > ev->join_result = res;
> > ev->type = EVENT_JOIN_RESPONSE;
> > msync(ev, sizeof(*ev), MS_SYNC);
> >
> > shm_queue_notify();
> >
> > - if (res == CJ_RES_MASTER_TRANSFER) {
> > - sd_eprintf("failed to join sheepdog cluster: "
> > - "please retry when master is up");
> > - shm_queue_unlock();
> > - exit(1);
> > - }
> > return false;
> > case EVENT_JOIN_RESPONSE:
> > - if (ev->join_result == CJ_RES_MASTER_TRANSFER) {
> > - /* FIXME: This code is tricky, but Sheepdog assumes that */
> > - /* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
> > - ev->nr_lnodes = 1;
> > - ev->lnodes[0] = this_node;
> > - nr_nodes = 1;
> > - nodes[0] = this_node.node;
> > - msync(ev, sizeof(*ev), MS_SYNC);
> > - }
> > -
> > sd_join_handler(&ev->sender.node, nodes, nr_nodes,
> > ev->join_result, ev->buf);
> > break;
> > diff --git a/sheep/cluster/shepherd.c b/sheep/cluster/shepherd.c
> > index 26fb005..db8336e 100644
> > --- a/sheep/cluster/shepherd.c
> > +++ b/sheep/cluster/shepherd.c
> > @@ -58,7 +58,7 @@ static int do_shepherd_join(void)
> > msg.body_len = msg_join_len;
> >
> > msg_join = xzalloc(msg_join_len);
> > - msg_join->node = this_node;
> > + msg_join->new_node = this_node;
> > memcpy(msg_join->opaque, kept_opaque, kept_opaque_len);
> >
> > ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len);
> > @@ -115,7 +115,7 @@ retry:
> > * FIXME: member change events must be ordered with nonblocked
> > * events
> > */
> > - res = sd_check_join_cb(&join->node, join->opaque);
> > + res = sd_check_join_cb(&join->new_node, NULL, 0, join->opaque);
> > if (res == CJ_RES_FAIL) {
> > sd_eprintf("sd_check_join_cb() failed");
> > exit(1);
> > @@ -161,19 +161,9 @@ retry:
> >
> > sd_iprintf("join reply arrived, nr_nodes: %d", join_reply->nr_nodes);
> >
> > - if (join_reply->res == CJ_RES_MASTER_TRANSFER) {
> > - is_master = true;
> > -
> > - /* FIXME: This code is tricky, but Sheepdog assumes that */
> > - /* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
> > - nr_nodes = 1;
> > - nodes[0] = this_node;
> > - } else {
> > - memcpy(nodes, join_reply->nodes,
> > - join_reply->nr_nodes * sizeof(struct sd_node));
> > -
> > - nr_nodes = join_reply->nr_nodes;
> > - }
> > + memcpy(nodes, join_reply->nodes,
> > + join_reply->nr_nodes * sizeof(struct sd_node));
> > + nr_nodes = join_reply->nr_nodes;
> >
> > /* FIXME: member change events must be ordered with nonblocked events */
> > sd_join_handler(&this_node, nodes, nr_nodes,
> > @@ -343,7 +333,7 @@ static void msg_new_node(struct sph_msg *rcv)
> > }
> >
> > /* FIXME: member change events must be ordered with nonblocked events */
> > - res = sd_check_join_cb(&join->node, join->opaque);
> > + res = sd_check_join_cb(&join->new_node, nodes, nr_nodes, join->opaque);
> >
> > join->res = res;
> >
> > @@ -357,12 +347,6 @@ static void msg_new_node(struct sph_msg *rcv)
> > exit(1);
> > }
> > free(join);
> > -
> > - if (res == CJ_RES_MASTER_TRANSFER) {
> > - sd_eprintf("failed to join sheepdog cluster: "
> > - "please retry when master is up");
> > - exit(1);
> > - }
> > }
> >
> > static void msg_new_node_finish(struct sph_msg *rcv)
> > @@ -380,11 +364,11 @@ static void msg_new_node_finish(struct sph_msg *rcv)
> >
> > jm = (struct join_message *)join_node_finish->opaque;
> > memcpy(nodes, join_node_finish->nodes,
> > - join_node_finish->nr_nodes * sizeof(struct sd_node));
> > + join_node_finish->nr_nodes * sizeof(struct sd_node));
> > nr_nodes = join_node_finish->nr_nodes;
> >
> > sd_iprintf("new node: %s",
> > - node_to_str(&join_node_finish->new_node));
> > + node_to_str(&join_node_finish->new_node));
> >
> > /* FIXME: member change events must be ordered with nonblocked events */
> > sd_join_handler(&join_node_finish->new_node, nodes, nr_nodes,
> > diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> > index 5ed16cf..aafc18d 100644
> > --- a/sheep/cluster/zookeeper.c
> > +++ b/sheep/cluster/zookeeper.c
> > @@ -845,15 +845,11 @@ static void zk_handle_join_request(struct zk_event *ev)
> > return;
> > }
> >
> > - res = sd_check_join_cb(&ev->sender.node, ev->buf);
> > + res = sd_check_join_cb(&ev->sender.node, sd_nodes, nr_sd_nodes,
> > + ev->buf);
> > ev->join_result = res;
> > push_join_response(ev);
> > - if (res == CJ_RES_MASTER_TRANSFER) {
> > - sd_eprintf("failed to join sheepdog cluster: "
> > - "please retry when master is up");
> > - zk_leave();
> > - exit(1);
> > - }
> > +
> > sd_dprintf("I'm the master now");
> > }
> >
> > @@ -896,19 +892,9 @@ static void zk_handle_join_response(struct zk_event *ev)
> > /* newly joined node */
> > init_node_list(ev);
> >
> > - if (ev->join_result == CJ_RES_MASTER_TRANSFER)
> > - /*
> > - * Sheepdog assumes that only one sheep is alive in
> > - * MASTER_TRANSFER scenario. So only the joining sheep is
> > - * supposed to return single node view to sd_join_handler().
> > - */
> > - zk_tree_destroy();
> > -
> > sd_dprintf("%s, %d", node_to_str(&ev->sender.node), ev->join_result);
> > switch (ev->join_result) {
> > case CJ_RES_SUCCESS:
> > - case CJ_RES_JOIN_LATER:
> > - case CJ_RES_MASTER_TRANSFER:
> > snprintf(path, sizeof(path), MEMBER_ZNODE"/%s",
> > node_to_str(&ev->sender.node));
> > if (node_eq(&ev->sender.node, &this_node.node)) {
> > diff --git a/sheep/group.c b/sheep/group.c
> > index cb87711..743b4fb 100644
> > --- a/sheep/group.c
> > +++ b/sheep/group.c
> > @@ -50,18 +50,6 @@ static main_thread(struct vnode_info *) current_vnode_info;
> > static main_thread(struct list_head *) pending_block_list;
> > static main_thread(struct list_head *) pending_notify_list;
> >
> > -/*
> > - * List of nodes that were part of the last epoch before a shutdown,
> > - * but failed to join.
> > - */
> > -static main_thread(struct list_head *) failed_nodes;
> > -
> > -/*
> > - * List of nodes that weren't part of the last epoch, but joined
> > - * before restarting the cluster.
> > - */
> > -static main_thread(struct list_head *) delayed_nodes;
> > -
> > static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
> > {
> > int nr_zones = 0, i, j;
> > @@ -341,120 +329,6 @@ static inline int get_nodes_nr_from(struct list_head *l)
> > return nr;
> > }
> >
> > -static int get_nodes_nr_epoch(uint32_t epoch)
> > -{
> > - struct sd_node nodes[SD_MAX_NODES];
> > -
> > - return epoch_log_read(epoch, nodes, sizeof(nodes));
> > -}
> > -
> > -static const struct sd_node *find_entry_list(const struct sd_node *entry,
> > - struct list_head *head)
> > -{
> > - struct node *n;
> > - list_for_each_entry(n, head, list)
> > - if (node_eq(&n->ent, entry))
> > - return entry;
> > -
> > - return NULL;
> > -
> > -}
> > -
> > -static const struct sd_node *find_entry_epoch(const struct sd_node *entry,
> > - uint32_t epoch)
> > -{
> > - struct sd_node nodes[SD_MAX_NODES];
> > - int nr;
> > -
> > - if (!epoch)
> > - return NULL;
> > -
> > - nr = epoch_log_read(epoch, nodes, sizeof(nodes));
> > -
> > - return xlfind(entry, nodes, nr, node_cmp);
> > -}
> > -
> > -/*
> > - * Add a node to the list of nodes that weren't part of the cluster before
> > - * it shut down, and thus do not count toward the nodes required to allow
> > - * an automated restart. These nodes will become part of the cluster by
> > - * the time it does get restarted.
> > - */
> > -static bool add_delayed_node(uint32_t epoch, const struct sd_node *node)
> > -{
> > - struct node *n;
> > -
> > - if (find_entry_list(node, main_thread_get(delayed_nodes)))
> > - return false;
> > - assert(!find_entry_epoch(node, epoch));
> > -
> > - n = xmalloc(sizeof(*n));
> > - n->ent = *node;
> > - list_add_tail(&n->list, main_thread_get(delayed_nodes));
> > - return true;
> > -}
> > -
> > -/*
> > - * For a node that failed to join check if was part of the original
> > - * epoch, and if so add it to the list of node expected to be present
> > - * but failing to join.
> > - */
> > -static bool add_failed_node(uint32_t epoch, const struct sd_node *node)
> > -{
> > - struct node *n;
> > -
> > - if (find_entry_list(node, main_thread_get(failed_nodes)))
> > - return false;
> > - if (!find_entry_epoch(node, epoch))
> > - return false;
> > -
> > - n = xmalloc(sizeof(*n));
> > - n->ent = *node;
> > - list_add_tail(&n->list, main_thread_get(failed_nodes));
> > - return true;
> > -}
> > -
> > -/*
> > - * Add the failed and delayed nodes in a join message to the local
> > - * lists of such nodes.
> > - */
> > -static void update_exceptional_node_list(uint32_t epoch,
> > - const struct join_message *jm)
> > -{
> > - int i;
> > -
> > - for (i = 0; i < jm->nr_failed_nodes; i++)
> > - add_failed_node(epoch, &jm->cinfo.nodes[i]);
> > - for ( ; i < jm->nr_failed_nodes + jm->nr_delayed_nodes; i++)
> > - add_delayed_node(epoch, &jm->cinfo.nodes[i]);
> > -}
> > -
> > -/* Format the lists of failed or delayed nodes into the join message. */
> > -static void format_exceptional_node_list(struct join_message *jm)
> > -{
> > - struct node *n;
> > -
> > - list_for_each_entry(n, main_thread_get(failed_nodes), list)
> > - jm->cinfo.nodes[jm->nr_failed_nodes++] = n->ent;
> > - list_for_each_entry(n, main_thread_get(delayed_nodes), list)
> > - jm->cinfo.nodes[jm->nr_failed_nodes +
> > - jm->nr_delayed_nodes++] = n->ent;
> > -}
> > -
> > -static void clear_exceptional_node_lists(void)
> > -{
> > - struct node *n, *t;
> > -
> > - list_for_each_entry_safe(n, t, main_thread_get(failed_nodes), list) {
> > - list_del(&n->list);
> > - free(n);
> > - }
> > - list_for_each_entry_safe(n, t, main_thread_get(delayed_nodes), list) {
> > - list_del(&n->list);
> > - free(n);
> > - }
> > -}
> > -
> > int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> > time_t *timestamp, struct vnode_info *vinfo)
> > {
> > @@ -496,8 +370,6 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> >
> > static int cluster_sanity_check(struct join_message *jm)
> > {
> > - uint32_t local_epoch = get_latest_epoch();
> > -
> > if (jm->cinfo.ctime != sys->cinfo.ctime) {
> > sd_eprintf("joining node ctime doesn't match: %"
> > PRIu64 " vs %" PRIu64, jm->cinfo.ctime,
> > @@ -505,12 +377,6 @@ static int cluster_sanity_check(struct join_message *jm)
> > return CJ_RES_FAIL;
> > }
> >
> > - if (jm->cinfo.epoch > local_epoch) {
> > - sd_eprintf("joining node epoch too large: %"
> > - PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
> > - return CJ_RES_FAIL;
> > - }
> > -
> > if (jm->cinfo.nr_copies != sys->cinfo.nr_copies) {
> > sd_eprintf("joining node nr_copies doesn't match: %u vs %u",
> > jm->cinfo.nr_copies, sys->cinfo.nr_copies);
> > @@ -526,88 +392,71 @@ static int cluster_sanity_check(struct join_message *jm)
> > return CJ_RES_SUCCESS;
> > }
> >
> > -static int cluster_wait_for_join_check(const struct sd_node *joined,
> > - struct join_message *jm)
> > +/*
> > + * Check whether enough node members are gathered.
> > + *
> > + * Sheepdog can start automatically if and only if all the members in the latest
> > + * epoch are gathered.
> > + */
> > +static bool enough_nodes_gathered(struct join_message *jm,
> > + const struct sd_node *joining,
> > + const struct sd_node *members,
> > + size_t nr_members)
> > {
> > - struct sd_node local_entries[SD_MAX_NODES];
> > - int nr, nr_local_entries, nr_failed_entries, nr_delayed_nodes;
> > - uint32_t local_epoch = get_latest_epoch();
> > - int ret;
> > - struct vnode_info *cur_vinfo;
> > + for (int i = 0; i < jm->cinfo.nr_nodes; i++) {
> > + const struct sd_node *key = jm->cinfo.nodes + i, *n;
> >
> > - if (jm->cinfo.nr_nodes == 0)
> > - return CJ_RES_JOIN_LATER;
> > -
> > - ret = cluster_sanity_check(jm);
> > - if (ret != CJ_RES_SUCCESS) {
> > - if (jm->cinfo.epoch > sys->cinfo.epoch) {
> > - sd_eprintf("transfer mastership (%d, %d)", jm->cinfo.epoch,
> > - sys->cinfo.epoch);
> > - return CJ_RES_MASTER_TRANSFER;
> > + n = xlfind(key, members, nr_members, node_cmp);
> > + if (n == NULL && !node_eq(key, joining)) {
> > + sd_dprintf("%s doesn't join yet", node_to_str(key));
> > + return false;
> > }
> > - return ret;
> > }
> >
> > - nr_local_entries = epoch_log_read(jm->cinfo.epoch, local_entries,
> > - sizeof(local_entries));
> > - if (nr_local_entries == -1)
> > - return CJ_RES_FAIL;
> > -
> > - if (jm->cinfo.epoch < local_epoch) {
> > - sd_eprintf("joining node epoch too small: %"
> > - PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
> > + sd_dprintf("all the nodes are gathered, %d, %zd", jm->cinfo.nr_nodes,
> > + nr_members);
> > + return true;
> > +}
> >
> > - if (xbsearch(joined, local_entries, nr_local_entries, node_cmp))
> > - return CJ_RES_FAIL;
> > - return CJ_RES_JOIN_LATER;
> > - }
> > +static int cluster_wait_for_join_check(const struct sd_node *joined,
> > + const struct sd_node *members,
> > + size_t nr_members,
> > + struct join_message *jm)
> > +{
> > + int ret;
> >
> > - if (jm->cinfo.nr_nodes != nr_local_entries) {
> > - sd_eprintf("epoch log entries do not match: %d vs %d",
> > - jm->cinfo.nr_nodes, nr_local_entries);
> > - return CJ_RES_FAIL;
> > + if (jm->cinfo.epoch != 0 && sys->cinfo.epoch != 0) {
> > + /* check whether joining node is valid or not */
> > + ret = cluster_sanity_check(jm);
> > + if (ret != CJ_RES_SUCCESS)
> > + return ret;
> > }
> >
> > -
> > - if (memcmp(jm->cinfo.nodes, local_entries,
> > - sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
> > + if (jm->cinfo.epoch > sys->cinfo.epoch)
> > + sys->cinfo = jm->cinfo;
> > + else if (jm->cinfo.epoch < sys->cinfo.epoch) {
> > + sd_dprintf("joining node has a smaller epoch, %" PRIu32 ", %"
> > + PRIu32, jm->cinfo.epoch, sys->cinfo.epoch);
> > + jm->cinfo = sys->cinfo;
> > + } else if (memcmp(jm->cinfo.nodes, sys->cinfo.nodes,
> > + sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
> > sd_eprintf("epoch log entries does not match");
> > return CJ_RES_FAIL;
> > }
> >
> > - cur_vinfo = main_thread_get(current_vnode_info);
> > - if (!cur_vinfo)
> > - nr = 1;
> > - else
> > - nr = cur_vinfo->nr_nodes + 1;
> > -
> > - nr_delayed_nodes = get_nodes_nr_from(main_thread_get(delayed_nodes));
> > -
> > /*
> > * If we have all members from the last epoch log in the in-memory
> > - * node list, and no new nodes joining we can set the cluster live
> > - * now without incrementing the epoch.
> > - */
> > - if (nr == nr_local_entries && !nr_delayed_nodes) {
> > - jm->cluster_status = SD_STATUS_OK;
> > - return CJ_RES_SUCCESS;
> > - }
> > -
> > - /*
> > - * If we reach the old node count, but some node failed we have to
> > - * update the epoch before setting the cluster live.
> > + * node list, we can set the cluster live now.
> > */
> > - nr_failed_entries = get_nodes_nr_from(main_thread_get(failed_nodes));
> > - if (nr_local_entries == nr + nr_failed_entries - nr_delayed_nodes) {
> > - jm->inc_epoch = 1;
> > + if (sys->cinfo.epoch > 0 &&
> > + enough_nodes_gathered(jm, joined, members, nr_members)) {
> > + if (jm->cinfo.nr_nodes < nr_members + 1)
>
> I'd suggest adding a comment here explaining why you add 1 to
> nr_members, or we'd better add a note that nr_members means the number
> of nodes *excluding* the joining or leaving node. Probably we should
> rename nr_members to nr_nodes too.
Okay, I'll address it in the next version.
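
For the record, the convention in question is roughly the following
(a standalone sketch with a hypothetical helper name, not the code that
will go into the patch):

#include <stddef.h>

/*
 * Hypothetical helper, for illustration only: nr_members counts the
 * nodes that are already in the cluster and excludes the node that is
 * joining (or leaving) right now, so the joining node has to be added
 * back before comparing against the node count recorded in the latest
 * epoch; hence the "+ 1".
 */
static size_t cluster_size_with_joining_node(size_t nr_members)
{
	return nr_members + 1;	/* existing members + the joining node */
}

Either a comment next to the "+ 1" or a rename to nr_nodes would
capture this; I'll pick one of the two.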
Thanks,
Kazutaka