[Sheepdog] [PATCH 10/18] collie: verify epoch tree at the master node

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Mar 11 07:48:09 CET 2010


The master node checks the epoch tree to determine whether nodes can start up or not.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.h          |    1 +
 collie/group.c           |  173 +++++++++++++++++++++++++++++++++++++++++++++-
 include/sheepdog_proto.h |   13 ++++
 3 files changed, 186 insertions(+), 1 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index ce8e6e4..f99466b 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -64,6 +64,7 @@ struct cluster_info {
 
 	uint32_t epoch;
 	uint32_t is_object_updated;
+	uint32_t status;
 
 	struct list_head cpg_node_list;
 	struct list_head sd_node_list;
diff --git a/collie/group.c b/collie/group.c
index 02cb2d5..a74ba39 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -60,7 +60,8 @@ struct join_message {
 	uint32_t epoch;
 	uint32_t nr_nodes;
 	uint32_t nr_sobjs;
-	uint32_t pad;
+	uint32_t cluster_status;
+
 	struct {
 		uint32_t nodeid;
 		uint32_t pid;
@@ -295,6 +296,170 @@ static int is_master(void)
 	return 0;
 }
 
+static int __check_split_brain(struct tree_vertex *vertex, int depth,
+			       struct tree_vertex **updated_vertex)
+{	/* tree_walk() callback: returns 1 when two updated epochs conflict, otherwise 0 */
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	if (!tree->updated)	/* epochs without the updated flag are ignored */
+		return 0;
+
+	if (!*updated_vertex) {	/* first updated vertex seen: remember it for later visits */
+		*updated_vertex = vertex;
+		return 0;
+	}
+
+	while (vertex) {	/* climb via tree_parent() until the remembered vertex or NULL is reached */
+		if (vertex == *updated_vertex)
+			return 0;
+		vertex = tree_parent(vertex);
+	}
+	return 1;	/* remembered vertex is neither this vertex nor one of its ancestors */
+}
+
+static int check_split_brain(struct tree_vertex *root)
+{	/* runs __check_split_brain through tree_walk() from root and returns tree_walk()'s result */
+	struct tree_vertex *vertex = NULL;	/* slot in which the callback remembers the first updated vertex */
+
+	return tree_walk(root, (tree_vertex_func_t)__check_split_brain, &vertex);
+}
+
+static int __find_start_epoch(struct tree_vertex *vertex, int depth,
+			      struct epoch_tree **ptree)
+{	/* tree_walk() callback: stores each updated epoch in *ptree; always returns 0 */
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	if (tree->updated) {
+		dprintf("new updated epoch = %" PRIu64 "\n", vertex->id); /* id is 64-bit, so "%ld" was not portable */
+		*ptree = tree;	/* a later updated vertex overwrites an earlier one */
+	}
+	return 0;
+}
+
+static struct epoch_tree *find_start_epoch(struct tree_vertex *root)
+{	/* returns the updated epoch stored last by __find_start_epoch, or NULL if none was stored */
+	struct epoch_tree *tree = NULL;
+
+	tree_walk(root, (tree_vertex_func_t)__find_start_epoch, &tree);
+
+	return tree;
+}
+
+struct start_epoch_info {	/* NOTE(review): not referenced by the code added in this hunk */
+	int nr_logs;	/* presumably the number of entries in logs -- confirm */
+	struct epoch_log *logs;
+};
+
+static int __check_enough_epochs(struct tree_vertex *vertex, int depth, void *data)
+{	/* tree_walk() callback: returns 1 for an updated epoch, 0 otherwise; depth and data are unused */
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+
+	if (tree->updated)
+		return 1;
+
+	return 0;
+}
+
+static int check_enough_epochs(struct tree_vertex *root)
+{	/* inspects the epoch tree under root and returns one of the SD_STATUS_* codes used below */
+	int res;	/* never initialised or read here; only its address is passed, and the callback ignores it */
+
+	if (tree_no_children(root))
+		return SD_STATUS_NO_EPOCH;
+
+	if (tree_first_child(root) != tree_last_child(root))	/* first and last child differ */
+		return SD_STATUS_MULTIPLE_EPOCH_TREES;
+
+	if (!tree_walk(root, __check_enough_epochs, &res))	/* presumably: no callback returned non-zero -- confirm tree_walk() semantics */
+		return SD_STATUS_NO_UPDATED_EPOCH;
+
+	return SD_STATUS_OK;
+}
+
+struct check_enough_nodes_info {	/* argument bundle passed to __check_enough_nodes */
+	int nr_nodes;	/* number of entries in nodes */
+	struct sheepdog_node_list_entry *nodes;
+};
+
+static int __check_enough_nodes(struct tree_vertex *vertex, int depth, struct check_enough_nodes_info *ceni)
+{	/* tree_walk() callback: returns 1 if a node of this epoch is absent from ceni->nodes, else 0 */
+	int i, j;
+	struct epoch_tree *tree;
+	int nr_nodes = ceni->nr_nodes;
+	struct sheepdog_node_list_entry *nodes = ceni->nodes;
+
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	for (i = 0; i < tree->nr_nodes; i++) {	/* every node recorded in this epoch */
+		for (j = 0; j < nr_nodes; j++) {	/* search the supplied list for the same id */
+			if (nodes[j].id == tree->nodes[i].id)
+				goto next;
+		}
+		return 1;	/* inner loop ended without finding a match */
+	next:
+		;
+	}
+
+	return 0;
+}
+
+static int check_enough_nodes(struct epoch_tree *tree, struct sheepdog_node_list_entry *nodes, int nr_nodes)
+{	/* returns 1 when the walk starting at tree->vertex returns 0, and 0 otherwise */
+	struct check_enough_nodes_info ceni;
+
+	ceni.nr_nodes = nr_nodes;
+	ceni.nodes = nodes;
+
+	return !tree_walk(&tree->vertex, (tree_vertex_func_t)__check_enough_nodes, &ceni);	/* callback returns 1 on a missing node, hence the negation */
+}
+
+static int __print_epoch_tree(struct tree_vertex *vertex, char *buf, int len)
+{	/* writes the vertex id as 16 hex digits followed by "(epoch)" into buf; returns the stored string length */
+	struct epoch_tree *tree;
+	tree = container_of(vertex, struct epoch_tree, vertex);
+	snprintf(buf, len, "%016" PRIx64"(%d)", vertex->id, tree->epoch);	/* output is bounded by len */
+
+	return strlen(buf);	/* length actually stored, not snprintf's would-be length */
+}
+
+static void print_epoch_tree(struct tree_vertex *root)
+{	/* passes root, the string "epoch_tree" and __print_epoch_tree to tree_print() */
+	tree_print(root, "epoch_tree", __print_epoch_tree);
+}
+
+static int get_cluster_status(struct sheepdog_node_list_entry *entries, int nr_entries,
+			      struct epoch_tree **start_point)
+{	/* returns an SD_STATUS_* code; *start_point is written only on the path that ends in SD_STATUS_OK */
+	struct epoch_tree *tree;
+	struct tree_vertex *v;
+	uint32_t status = sys->status;	/* NOTE(review): dead initialiser, overwritten below before it is read */
+
+	tree_for_each_child(v, &sys->epoch_tree_root) {	/* print every child of the epoch tree root */
+		print_epoch_tree(v);
+	}
+
+	status = check_enough_epochs(&sys->epoch_tree_root);
+	if (status == SD_STATUS_OK) {
+		if (check_split_brain(&sys->epoch_tree_root)) {
+			eprintf("Object was updated when split brain was occurred\n");
+			status = SD_STATUS_EPOCH_CONFLICT;
+			goto out;
+		}
+
+		tree = find_start_epoch(&sys->epoch_tree_root);	/* NOTE(review): not NULL-checked; presumably SD_STATUS_OK above implies an updated epoch exists -- confirm */
+		if (check_enough_nodes(tree, entries, nr_entries)) {
+			*start_point = tree;
+			status =  SD_STATUS_OK;
+		} else
+			status = SD_STATUS_MISSING_NODES;
+	} else if (sys->status == SD_STATUS_OK) {	/* tests the stored sys->status, not the local status computed above */
+		eprintf("a newly added node has incorrect epoch info\n");
+		status = SD_STATUS_UNKNOWN_ERROR;
+	}
+out:
+	return status;	/* uint32_t local returned through the int return type */
+}
+
 static int add_epoch_log(int epoch, uint64_t parent_hval, uint64_t hval,
 			 int nr_nodes, struct sheepdog_node_list_entry *nodes,
 			 int is_updated, uint64_t ctime)
@@ -333,6 +498,7 @@ static int add_epoch_log(int epoch, uint64_t parent_hval, uint64_t hval,
 static void join(struct join_message *msg)
 {
 	struct node *node;
+	struct epoch_tree *start_point = NULL;
 	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
 
 	if (!sys->synchronized)
@@ -458,6 +624,10 @@ static void join(struct join_message *msg)
 		msg->nodes[msg->nr_nodes].ent = node->ent;
 		msg->nr_nodes++;
 	}
+
+	int nr_nodes = build_node_list(&sys->cpg_node_list, entries);
+	msg->cluster_status = get_cluster_status(entries, nr_nodes, &start_point);
+	dprintf("nr_nodes %d, status %d\n", nr_nodes, msg->cluster_status);
 out:
 	return;
 }
@@ -1009,6 +1179,7 @@ join_retry:
 	sys->this_node.id = hval;
 
 	sys->synchronized = 0;
+	sys->status = SD_STATUS_NO_EPOCH;
 	INIT_LIST_HEAD(&sys->sd_node_list);
 	INIT_LIST_HEAD(&sys->cpg_node_list);
 	INIT_LIST_HEAD(&sys->vm_list);
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index c1197fb..c28dcba 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -65,6 +65,19 @@
 
 #define SD_FLAG_CMD_SNAPSHOT (1U << 8)
 
+#define SD_STATUS_STARTUP_MASK  0x20 /* Bit set in the 0x21-0x24 codes below */
+#define SD_STATUS_SHUTDOWN_MASK 0x40 /* Bit set in SD_STATUS_SHUTDOWN */
+#define SD_STATUS_ERROR_MASK    0x80 /* Bit set in the 0x85 and 0x86 codes below */
+
+#define SD_STATUS_OK            0x00 /* None of the mask bits above is set */
+#define SD_STATUS_NO_EPOCH      0x21 /* There is no epoch */
+#define SD_STATUS_MULTIPLE_EPOCH_TREES  0x22 /* Too many epoch trees */
+#define SD_STATUS_NO_UPDATED_EPOCH      0x23 /* There is no updated epoch */
+#define SD_STATUS_MISSING_NODES 0x24 /* Too few nodes to start sheepdog */
+#define SD_STATUS_SHUTDOWN      0x41 /* Only code carrying SD_STATUS_SHUTDOWN_MASK */
+#define SD_STATUS_EPOCH_CONFLICT    0x85 /* Cannot resolve where to start */
+#define SD_STATUS_UNKNOWN_ERROR 0x86 /* Unknown error has occurred */
+
 #define SD_RES_SUCCESS       0x00 /* Success */
 #define SD_RES_UNKNOWN       0x01 /* Unknown error */
 #define SD_RES_NO_OBJ        0x02 /* No object found */
-- 
1.5.6.5




More information about the sheepdog mailing list