[Sheepdog] [PATCH 11/18] collie: update cluster status of all the nodes

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Mar 11 07:48:10 CET 2010


The master node distributes cluster status information to all the nodes.
When sheepdog can start, the master node also distributes the epoch
from which the nodes will start.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.h          |    2 +
 collie/group.c           |  139 +++++++++++++++++++++++++++++++++++++---------
 collie/store.c           |   85 ++++++++++++++++++++++++++++
 include/sheepdog_proto.h |    8 +++
 4 files changed, 208 insertions(+), 26 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index f99466b..f8e0b18 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -117,8 +117,10 @@ extern struct work_queue *dobj_queue;
 
 int epoch_log_write(uint32_t epoch, char *buf, int len);
 int epoch_log_read(uint32_t epoch, char *buf, int len);
+int get_latest_epoch(void);
 int set_epoch_updated(uint32_t epoch, uint32_t val);
 uint32_t is_epoch_updated(uint32_t epoch);
+int remove_epoch(int epoch);
 uint64_t epoch_hash_val(uint32_t epoch, uint64_t ctime,
 			struct sheepdog_node_list_entry *entries, int len);
 int set_cluster_ctime(uint64_t ctime);
diff --git a/collie/group.c b/collie/group.c
index a74ba39..01b51f8 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -62,6 +62,13 @@ struct join_message {
 	uint32_t nr_sobjs;
 	uint32_t cluster_status;
 
+	uint64_t epoch_ctime;
+	uint32_t start_epoch;
+	uint16_t nr_start_epoch_logs;
+	uint16_t nr_start_nodes;
+	uint64_t start_epoch_hvals[SD_MAX_NODES];
+	struct sheepdog_node_list_entry start_nodes[SD_MAX_NODES];
+
 	struct {
 		uint32_t nodeid;
 		uint32_t pid;
@@ -345,11 +352,6 @@ static struct epoch_tree *find_start_epoch(struct tree_vertex *root)
 	return tree;
 }
 
-struct start_epoch_info {
-	int nr_logs;
-	struct epoch_log *logs;
-};
-
 static int __check_enough_epochs(struct tree_vertex *vertex, int depth, void *data)
 {
 	struct epoch_tree *tree;
@@ -628,6 +630,23 @@ static void join(struct join_message *msg)
 	int nr_nodes = build_node_list(&sys->cpg_node_list, entries);
 	msg->cluster_status = get_cluster_status(entries, nr_nodes, &start_point);
 	dprintf("nr_nodes %d, status %d\n", nr_nodes, msg->cluster_status);
+
+	if (msg->cluster_status == SD_STATUS_OK) {
+		struct tree_vertex *v;
+		msg->epoch_ctime = start_point->ctime;
+		msg->start_epoch = start_point->epoch;
+		msg->nr_start_nodes = start_point->nr_nodes;
+		memcpy(msg->start_nodes, start_point->nodes,
+		       sizeof(start_point->nodes[0]) * start_point->nr_nodes);
+
+		v = &start_point->vertex;
+		msg->nr_start_epoch_logs = 0;
+		while (v->id) {
+			msg->start_epoch_hvals[msg->nr_start_epoch_logs] = v->id;
+			msg->nr_start_epoch_logs++;
+			v = tree_parent(v);
+		}
+	}
 out:
 	return;
 }
@@ -667,31 +686,96 @@ static void update_cluster_info(struct join_message *msg)
 	sys->epoch = msg->epoch;
 	sys->synchronized = 1;
 
-	nr_nodes = build_node_list(&sys->sd_node_list, entry);
+	eprintf("system status = %d\n", msg->cluster_status);
+	if (sys->status == SD_STATUS_OK) {
+		nr_nodes = build_node_list(&sys->sd_node_list, entry);
 
-	ret = epoch_log_write(sys->epoch, (char *)entry,
-			      nr_nodes * sizeof(struct sheepdog_node_list_entry));
-	if (ret < 0)
-		eprintf("can't write epoch %u\n", sys->epoch);
+		dprintf("update epoch, %d, %d\n", sys->epoch, nr_nodes);
+		ret = epoch_log_write(sys->epoch, (char *)entry,
+				      nr_nodes * sizeof(struct sheepdog_node_list_entry));
+		if (ret < 0)
+			eprintf("can't write epoch %u\n", sys->epoch);
+	}
 
-	/* we are ready for object operations */
-	update_epoch_store(sys->epoch);
 out:
 	add_node(&sys->sd_node_list, msg->nodeid, msg->pid, &msg->header.from);
 
-	nr_nodes = build_node_list(&sys->sd_node_list, entry);
+	if (sys->status == SD_STATUS_OK) {
+		nr_nodes = build_node_list(&sys->sd_node_list, entry);
 
-	ret = epoch_log_write(sys->epoch + 1, (char *)entry,
-			      nr_nodes * sizeof(struct sheepdog_node_list_entry));
-	if (ret < 0)
-		eprintf("can't write epoch %u\n", sys->epoch + 1);
+		dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr_nodes);
+		ret = epoch_log_write(sys->epoch + 1, (char *)entry,
+				      nr_nodes * sizeof(struct sheepdog_node_list_entry));
+		if (ret < 0)
+			eprintf("can't write epoch %u\n", sys->epoch + 1);
 
-	sys->epoch++;
-	sys->is_object_updated = 0;
+		sys->epoch++;
+		sys->is_object_updated = 0;
 
-	update_epoch_store(sys->epoch);
+		update_epoch_store(sys->epoch);
+	}
 
 	print_node_list(&sys->sd_node_list);
+
+	if (sys->status & SD_STATUS_STARTUP_MASK && msg->cluster_status == SD_STATUS_OK) {
+		int i, j;
+		int latest_epoch = get_latest_epoch();
+		int size;
+		uint64_t hval;
+		int has_start_epoch_log = 0;
+
+		dprintf("latest_epoch = %d\n", latest_epoch);
+		for (i = 1; i <= latest_epoch; i++) {
+			size = epoch_log_read(i, (char *)entry, sizeof(entry));
+			if (size <= 0)
+				continue;
+			hval = epoch_hash_val(i, get_cluster_ctime(),
+					      entry, size / sizeof(*entry));
+			for (j = 0; j < msg->nr_start_epoch_logs; j++) {
+				if (msg->start_epoch_hvals[j] == hval) {
+					if (j == 0)
+						has_start_epoch_log = 1;
+					goto next;
+				}
+			}
+			remove_epoch(i);
+		next:
+			;
+		}
+		set_cluster_ctime(msg->epoch_ctime);
+		sys->epoch = msg->start_epoch;
+
+		if (!has_start_epoch_log) {
+			dprintf("write start epoch, %d, %d\n", sys->epoch, msg->nr_start_nodes);
+			ret = epoch_log_write(sys->epoch, (char *)msg->start_nodes,
+					      msg->nr_start_nodes * sizeof(struct sheepdog_node_list_entry));
+			if (ret < 0)
+				eprintf("can't write epoch %u\n", sys->epoch);
+		}
+
+		nr_nodes = build_node_list(&sys->sd_node_list, entry);
+		uint64_t local_hval = epoch_hash_val(sys->epoch, msg->epoch_ctime, entry, nr_nodes);
+		uint64_t start_hval = msg->start_epoch_hvals[0];
+		dprintf("start epoch hval = %lu, local epoch hval = %lu\n", start_hval, local_hval);
+		if (start_hval != local_hval) {
+			dprintf("update epoch because current node_list is different from what it was, %d\n", sys->epoch + 1);
+			ret = epoch_log_write(sys->epoch + 1, (char *)entry,
+					      nr_nodes * sizeof(struct sheepdog_node_list_entry));
+			if (ret < 0)
+				eprintf("can't write epoch %u\n", sys->epoch + 1);
+
+			sys->epoch++;
+		}
+
+		sys->is_object_updated = 0;
+		sys->status = SD_STATUS_OK;
+
+		update_epoch_store(sys->epoch);
+	}
+
+	sys->status = msg->cluster_status;
+
+	return;
 }
 
 static void vdi_op(struct vdi_op_message *msg)
@@ -944,14 +1028,17 @@ static void __sd_confch(struct work *work, int idx)
 			list_del(&node->list);
 			free(node);
 
-			nr = build_node_list(&sys->sd_node_list, e);
-			epoch_log_write(sys->epoch + 1, (char *)e,
-					nr * sizeof(struct sheepdog_node_list_entry));
+			if (sys->status == SD_STATUS_OK) {
+				nr = build_node_list(&sys->sd_node_list, e);
+				dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
+				epoch_log_write(sys->epoch + 1, (char *)e,
+						nr * sizeof(struct sheepdog_node_list_entry));
 
-			sys->epoch++;
-			sys->is_object_updated = 0;
+				sys->epoch++;
+				sys->is_object_updated = 0;
 
-			update_epoch_store(sys->epoch);
+				update_epoch_store(sys->epoch);
+			}
 		}
 	}
 
diff --git a/collie/store.c b/collie/store.c
index 59f5476..021380c 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -1068,6 +1068,26 @@ int epoch_log_read(uint32_t epoch, char *buf, int len)
 	return len;
 }
 
+int get_latest_epoch(void)
+{
+	DIR *dir;
+	struct dirent *d;
+	uint32_t e, epoch = 0;
+	/* Find the largest number among the entry names in epoch_path. */
+	dir = opendir(epoch_path);
+	if (!dir)
+		return -1;	/* directory could not be opened */
+
+	while ((d = readdir(dir))) {
+		e = atoi(d->d_name);	/* "." and ".." parse as 0 */
+		if (e > epoch)
+			epoch = e;
+	}
+	closedir(dir);
+
+	return epoch;	/* still 0 if every name parsed as 0 */
+}
+
 int set_epoch_updated(uint32_t epoch, uint32_t val)
 {
 	int fd, ret;
@@ -1108,6 +1128,71 @@ uint32_t is_epoch_updated(uint32_t epoch)
 	return val;
 }
 
+/* Remove dir_path and everything below it; returns 0 on success, a negative value on failure. */
+static int rmdir_r(char *dir_path)
+{
+	int ret;
+	struct stat s;
+	DIR *dir;
+	struct dirent *d;
+	char path[PATH_MAX];
+
+	dir = opendir(dir_path);
+	if (!dir) {
+		if (errno != ENOENT)
+			eprintf("failed, %s, %d\n", dir_path, errno);
+		return -errno;	/* NOTE(review): eprintf() above may have changed errno - confirm */
+	}
+
+	while ((d = readdir(dir))) {
+		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+			continue;
+
+		snprintf(path, sizeof(path), "%s/%s", dir_path, d->d_name);
+		ret = stat(path, &s);	/* NOTE(review): follows symlinks; lstat() may be intended */
+		if (ret) {
+			eprintf("cannot remove directory %s\n", path);
+			goto out;	/* ret is the -1 from stat(), not -errno */
+		}
+		if (S_ISDIR(s.st_mode))
+			ret = rmdir_r(path);
+		else
+			ret = unlink(path);
+
+		if (ret != 0) {
+			eprintf("failed, %s, %d, %d\n", path, S_ISDIR(s.st_mode), errno);
+			goto out;	/* stop at the first entry that cannot be removed */
+		}
+	}
+	/* Every entry was removed above; now remove the directory itself. */
+	ret = rmdir(dir_path);
+out:
+	closedir(dir);
+	return ret;
+}
+
+int remove_epoch(int epoch)
+{
+	int ret;
+	char path[PATH_MAX];
+	/* Remove the epoch's log file, then its object directory; 0 or SD_RES_EIO. */
+	dprintf("remove epoch %d\n", epoch);
+	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
+	ret = unlink(path);	/* -1 with errno set on failure, never -errno */
+	if (ret && errno != ENOENT) {
+		eprintf("failed to remove %s, %s\n", path, strerror(errno));
+		return SD_RES_EIO;
+	}
+
+	snprintf(path, sizeof(path), "%s%08u", obj_path, epoch);
+	ret = rmdir_r(path);	/* a missing directory comes back as -ENOENT */
+	if (ret && ret != -ENOENT) {
+		eprintf("failed to remove %s, %s\n", path, strerror(-ret));
+		return SD_RES_EIO;
+	}
+	return 0;
+}
+
 uint64_t epoch_hash_val(uint32_t epoch, uint64_t ctime,
 			struct sheepdog_node_list_entry *entries, int len)
 {
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index c28dcba..e835f68 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -276,6 +276,14 @@ struct sheepdog_node_list_entry {
 	uint16_t	pad;
 };
 
+struct epoch_log {
+	uint64_t ctime;	/* presumably the cluster creation time - TODO confirm */
+	uint64_t hval;	/* presumably an epoch_hash_val() result - TODO confirm */
+	uint32_t epoch;
+	uint32_t nr_nodes;	/* presumably the number of valid entries in nodes[] - TODO confirm */
+	struct sheepdog_node_list_entry nodes[SD_MAX_NODES];	/* room for SD_MAX_NODES entries */
+};
+
 /*
  * 64 bit FNV-1a non-zero initial basis
  */
-- 
1.5.6.5




More information about the sheepdog mailing list