The master node checks the epoch tree to decide whether the nodes can
start up.  The result is carried in the new cluster_status field of the
join message, which replaces the former pad field.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.h          |    1 +
 collie/group.c           |  219 +++++++++++++++++++++++++++++++++++++++++++++-
 include/sheepdog_proto.h |   13 ++++
 3 files changed, 232 insertions(+), 1 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index ce8e6e4..f99466b 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -64,6 +64,7 @@ struct cluster_info {
 
 	uint32_t epoch;
 	uint32_t is_object_updated;
+	uint32_t status;
 
 	struct list_head cpg_node_list;
 	struct list_head sd_node_list;
diff --git a/collie/group.c b/collie/group.c
index 02cb2d5..a74ba39 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -60,7 +60,8 @@ struct join_message {
 	uint32_t epoch;
 	uint32_t nr_nodes;
 	uint32_t nr_sobjs;
-	uint32_t pad;
+	uint32_t cluster_status;
+
 	struct {
 		uint32_t nodeid;
 		uint32_t pid;
@@ -295,6 +296,216 @@ static int is_master(void)
 	return 0;
 }
 
+/*
+ * tree_walk() callback for check_split_brain().
+ *
+ * 'data' points to a struct tree_vertex * holding the first updated vertex
+ * seen so far.  Returns 1 when another updated vertex is found that is
+ * neither that vertex nor one of its descendants, and 0 otherwise.
+ */
+static int __check_split_brain(struct tree_vertex *vertex, int depth,
+			       void *data)
+{
+	struct tree_vertex **updated_vertex = data;
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	if (!tree->updated)
+		return 0;
+
+	if (!*updated_vertex) {
+		*updated_vertex = vertex;
+		return 0;
+	}
+
+	while (vertex) {
+		if (vertex == *updated_vertex)
+			return 0;
+		vertex = tree_parent(vertex);
+	}
+	return 1;
+}
+
+/*
+ * Walk the epoch tree with __check_split_brain().  Returns the tree_walk()
+ * result, which get_cluster_status() treats as a conflict when non-zero.
+ */
+static int check_split_brain(struct tree_vertex *root)
+{
+	struct tree_vertex *vertex = NULL;
+
+	return tree_walk(root, __check_split_brain, &vertex);
+}
+
+/*
+ * tree_walk() callback for find_start_epoch(): stores each updated epoch in
+ * the struct epoch_tree * that 'data' points to, so the last one visited wins.
+ */
+static int __find_start_epoch(struct tree_vertex *vertex, int depth,
+			      void *data)
+{
+	struct epoch_tree **ptree = data;
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	if (tree->updated) {
+		dprintf("new updated epoch = %016" PRIx64 "\n", vertex->id);
+		*ptree = tree;
+	}
+	return 0;
+}
+
+/* Returns the last updated epoch visited by tree_walk(), or NULL if none. */
+static struct epoch_tree *find_start_epoch(struct tree_vertex *root)
+{
+	struct epoch_tree *tree = NULL;
+
+	tree_walk(root, __find_start_epoch, &tree);
+
+	return tree;
+}
+
+struct start_epoch_info {
+	int nr_logs;
+	struct epoch_log *logs;
+};
+
+/* tree_walk() callback: returns 1 for an updated epoch, 0 otherwise. */
+static int __check_enough_epochs(struct tree_vertex *vertex, int depth, void *data)
+{
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+
+	if (tree->updated)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Check that the epoch tree can be used to start the cluster.
+ *
+ * Returns SD_STATUS_NO_EPOCH if the root has no children,
+ * SD_STATUS_MULTIPLE_EPOCH_TREES if its first and last child differ,
+ * SD_STATUS_NO_UPDATED_EPOCH if tree_walk() returns 0 for
+ * __check_enough_epochs(), and SD_STATUS_OK otherwise.
+ */
+static int check_enough_epochs(struct tree_vertex *root)
+{
+	if (tree_no_children(root))
+		return SD_STATUS_NO_EPOCH;
+
+	if (tree_first_child(root) != tree_last_child(root))
+		return SD_STATUS_MULTIPLE_EPOCH_TREES;
+
+	/* The callback ignores its data argument. */
+	if (!tree_walk(root, __check_enough_epochs, NULL))
+		return SD_STATUS_NO_UPDATED_EPOCH;
+
+	return SD_STATUS_OK;
+}
+
+/* Arguments handed to __check_enough_nodes() through tree_walk(). */
+struct check_enough_nodes_info {
+	int nr_nodes;
+	struct sheepdog_node_list_entry *nodes;
+};
+
+/*
+ * tree_walk() callback for check_enough_nodes().
+ *
+ * Returns 1 if some node of this epoch is absent from the list in 'data'
+ * (a struct check_enough_nodes_info), and 0 if all of them are present.
+ */
+static int __check_enough_nodes(struct tree_vertex *vertex, int depth, void *data)
+{
+	int i, j;
+	struct epoch_tree *tree;
+	struct check_enough_nodes_info *ceni = data;
+	int nr_nodes = ceni->nr_nodes;
+	struct sheepdog_node_list_entry *nodes = ceni->nodes;
+
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	for (i = 0; i < tree->nr_nodes; i++) {
+		for (j = 0; j < nr_nodes; j++) {
+			if (nodes[j].id == tree->nodes[i].id)
+				goto next;
+		}
+		return 1;
+	next:
+		;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns non-zero if tree_walk() starting at 'tree' returns 0, i.e. no
+ * visited epoch lists a node that is missing from 'nodes'.
+ */
+static int check_enough_nodes(struct epoch_tree *tree, struct sheepdog_node_list_entry *nodes, int nr_nodes)
+{
+	struct check_enough_nodes_info ceni;
+
+	ceni.nr_nodes = nr_nodes;
+	ceni.nodes = nodes;
+
+	return !tree_walk(&tree->vertex, __check_enough_nodes, &ceni);
+}
+
+/* tree_print() callback: formats the vertex as "<hex id>(<epoch>)". */
+static int __print_epoch_tree(struct tree_vertex *vertex, char *buf, int len)
+{
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	snprintf(buf, len, "%016" PRIx64"(%d)", vertex->id, tree->epoch);
+
+	return strlen(buf);
+}
+
+/* Dump the tree rooted at 'root' with tree_print(). */
+static void print_epoch_tree(struct tree_vertex *root)
+{
+	tree_print(root, "epoch_tree", __print_epoch_tree);
+}
+
+/*
+ * Decide the cluster status from the epoch tree and the given node list.
+ *
+ * On SD_STATUS_OK, *start_point is set to the epoch returned by
+ * find_start_epoch(); it is left untouched for every other status.
+ */
+static int get_cluster_status(struct sheepdog_node_list_entry *entries, int nr_entries,
+			      struct epoch_tree **start_point)
+{
+	struct epoch_tree *tree;
+	struct tree_vertex *v;
+	int status;
+
+	tree_for_each_child(v, &sys->epoch_tree_root) {
+		print_epoch_tree(v);
+	}
+
+	status = check_enough_epochs(&sys->epoch_tree_root);
+	if (status == SD_STATUS_OK) {
+		if (check_split_brain(&sys->epoch_tree_root)) {
+			eprintf("Object was updated when split brain was occurred\n");
+			status = SD_STATUS_EPOCH_CONFLICT;
+			goto out;
+		}
+
+		tree = find_start_epoch(&sys->epoch_tree_root);
+		if (check_enough_nodes(tree, entries, nr_entries)) {
+			*start_point = tree;
+			status = SD_STATUS_OK;
+		} else
+			status = SD_STATUS_MISSING_NODES;
+	} else if (sys->status == SD_STATUS_OK) {
+		eprintf("a newly added node has incorrect epoch info\n");
+		status = SD_STATUS_UNKNOWN_ERROR;
+	}
+out:
+	return status;
+}
+
 static int add_epoch_log(int epoch, uint64_t parent_hval, uint64_t hval,
 			 int nr_nodes, struct sheepdog_node_list_entry *nodes,
 			 int is_updated, uint64_t ctime)
@@ -333,6 +544,7 @@ static int add_epoch_log(int epoch, uint64_t parent_hval, uint64_t hval,
 static void join(struct join_message *msg)
 {
 	struct node *node;
+	struct epoch_tree *start_point = NULL;
 	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
 
 	if (!sys->synchronized)
@@ -458,6 +670,10 @@ static void join(struct join_message *msg)
 		msg->nodes[msg->nr_nodes].ent = node->ent;
 		msg->nr_nodes++;
 	}
+
+	int nr_nodes = build_node_list(&sys->cpg_node_list, entries);
+	msg->cluster_status = get_cluster_status(entries, nr_nodes, &start_point);
+	dprintf("nr_nodes %d, status %d\n", nr_nodes, msg->cluster_status);
 out:
 	return;
 }
@@ -1009,6 +1225,7 @@ join_retry:
 
 	sys->this_node.id = hval;
 	sys->synchronized = 0;
+	sys->status = SD_STATUS_NO_EPOCH;
 	INIT_LIST_HEAD(&sys->sd_node_list);
 	INIT_LIST_HEAD(&sys->cpg_node_list);
 	INIT_LIST_HEAD(&sys->vm_list);
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index c1197fb..c28dcba 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -65,6 +65,19 @@
 
 #define SD_FLAG_CMD_SNAPSHOT (1U << 8)
 
+#define SD_STATUS_STARTUP_MASK         0x20
+#define SD_STATUS_SHUTDOWN_MASK        0x40
+#define SD_STATUS_ERROR_MASK           0x80
+
+#define SD_STATUS_OK                   0x00
+#define SD_STATUS_NO_EPOCH             0x21 /* There is no epoch */
+#define SD_STATUS_MULTIPLE_EPOCH_TREES 0x22 /* Too many epoch tree */
+#define SD_STATUS_NO_UPDATED_EPOCH     0x23 /* There is no updated epoch */
+#define SD_STATUS_MISSING_NODES        0x24 /* Too few nodes to start sheepdog */
+#define SD_STATUS_SHUTDOWN             0x41
+#define SD_STATUS_EPOCH_CONFLICT       0x85 /* Cannot resolve where to start */
+#define SD_STATUS_UNKNOWN_ERROR        0x86 /* Unknown error has occurred */
+
 #define SD_RES_SUCCESS       0x00 /* Success */
 #define SD_RES_UNKNOWN       0x01 /* Unknown error */
 #define SD_RES_NO_OBJ        0x02 /* No object found */
-- 
1.5.6.5