[Sheepdog] [PATCH 11/18] collie: update cluster status of all the nodes
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Thu Mar 11 07:48:10 CET 2010
The master node distributes cluster status information to all the nodes.
When sheepdog can start, the master node also distributes the epoch from
which the nodes will start.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/collie.h | 2 +
collie/group.c | 139 +++++++++++++++++++++++++++++++++++++---------
collie/store.c | 85 ++++++++++++++++++++++++++++
include/sheepdog_proto.h | 8 +++
4 files changed, 208 insertions(+), 26 deletions(-)
diff --git a/collie/collie.h b/collie/collie.h
index f99466b..f8e0b18 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -117,8 +117,10 @@ extern struct work_queue *dobj_queue;
int epoch_log_write(uint32_t epoch, char *buf, int len);
int epoch_log_read(uint32_t epoch, char *buf, int len);
+int get_latest_epoch(void);
int set_epoch_updated(uint32_t epoch, uint32_t val);
uint32_t is_epoch_updated(uint32_t epoch);
+int remove_epoch(int epoch);
uint64_t epoch_hash_val(uint32_t epoch, uint64_t ctime,
struct sheepdog_node_list_entry *entries, int len);
int set_cluster_ctime(uint64_t ctime);
diff --git a/collie/group.c b/collie/group.c
index a74ba39..01b51f8 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -62,6 +62,13 @@ struct join_message {
uint32_t nr_sobjs;
uint32_t cluster_status;
+ uint64_t epoch_ctime;
+ uint32_t start_epoch;
+ uint16_t nr_start_epoch_logs;
+ uint16_t nr_start_nodes;
+ uint64_t start_epoch_hvals[SD_MAX_NODES];
+ struct sheepdog_node_list_entry start_nodes[SD_MAX_NODES];
+
struct {
uint32_t nodeid;
uint32_t pid;
@@ -345,11 +352,6 @@ static struct epoch_tree *find_start_epoch(struct tree_vertex *root)
return tree;
}
-struct start_epoch_info {
- int nr_logs;
- struct epoch_log *logs;
-};
-
static int __check_enough_epochs(struct tree_vertex *vertex, int depth, void *data)
{
struct epoch_tree *tree;
@@ -628,6 +630,23 @@ static void join(struct join_message *msg)
int nr_nodes = build_node_list(&sys->cpg_node_list, entries);
msg->cluster_status = get_cluster_status(entries, nr_nodes, &start_point);
dprintf("nr_nodes %d, status %d\n", nr_nodes, msg->cluster_status);
+
+ if (msg->cluster_status == SD_STATUS_OK) {
+ struct tree_vertex *v;
+ msg->epoch_ctime = start_point->ctime;
+ msg->start_epoch = start_point->epoch;
+ msg->nr_start_nodes = start_point->nr_nodes;
+ memcpy(msg->start_nodes, start_point->nodes,
+ sizeof(start_point->nodes[0]) * start_point->nr_nodes);
+
+ v = &start_point->vertex;
+ msg->nr_start_epoch_logs = 0;
+ while (v->id) {
+ msg->start_epoch_hvals[msg->nr_start_epoch_logs] = v->id;
+ msg->nr_start_epoch_logs++;
+ v = tree_parent(v);
+ }
+ }
out:
return;
}
@@ -667,31 +686,96 @@ static void update_cluster_info(struct join_message *msg)
sys->epoch = msg->epoch;
sys->synchronized = 1;
- nr_nodes = build_node_list(&sys->sd_node_list, entry);
+ eprintf("system status = %d\n", msg->cluster_status);
+ if (sys->status == SD_STATUS_OK) {
+ nr_nodes = build_node_list(&sys->sd_node_list, entry);
- ret = epoch_log_write(sys->epoch, (char *)entry,
- nr_nodes * sizeof(struct sheepdog_node_list_entry));
- if (ret < 0)
- eprintf("can't write epoch %u\n", sys->epoch);
+ dprintf("update epoch, %d, %d\n", sys->epoch, nr_nodes);
+ ret = epoch_log_write(sys->epoch, (char *)entry,
+ nr_nodes * sizeof(struct sheepdog_node_list_entry));
+ if (ret < 0)
+ eprintf("can't write epoch %u\n", sys->epoch);
+ }
- /* we are ready for object operations */
- update_epoch_store(sys->epoch);
out:
add_node(&sys->sd_node_list, msg->nodeid, msg->pid, &msg->header.from);
- nr_nodes = build_node_list(&sys->sd_node_list, entry);
+ if (sys->status == SD_STATUS_OK) {
+ nr_nodes = build_node_list(&sys->sd_node_list, entry);
- ret = epoch_log_write(sys->epoch + 1, (char *)entry,
- nr_nodes * sizeof(struct sheepdog_node_list_entry));
- if (ret < 0)
- eprintf("can't write epoch %u\n", sys->epoch + 1);
+ dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr_nodes);
+ ret = epoch_log_write(sys->epoch + 1, (char *)entry,
+ nr_nodes * sizeof(struct sheepdog_node_list_entry));
+ if (ret < 0)
+ eprintf("can't write epoch %u\n", sys->epoch + 1);
- sys->epoch++;
- sys->is_object_updated = 0;
+ sys->epoch++;
+ sys->is_object_updated = 0;
- update_epoch_store(sys->epoch);
+ update_epoch_store(sys->epoch);
+ }
print_node_list(&sys->sd_node_list);
+
+ if (sys->status & SD_STATUS_STARTUP_MASK && msg->cluster_status == SD_STATUS_OK) {
+ int i, j;
+ int latest_epoch = get_latest_epoch();
+ int size;
+ uint64_t hval;
+ int has_start_epoch_log = 0;
+
+ dprintf("latest_epoch = %d\n", latest_epoch);
+ for (i = 1; i <= latest_epoch; i++) {
+ size = epoch_log_read(i, (char *)entry, sizeof(entry));
+ if (size <= 0)
+ continue;
+ hval = epoch_hash_val(i, get_cluster_ctime(),
+ entry, size / sizeof(*entry));
+ for (j = 0; j < msg->nr_start_epoch_logs; j++) {
+ if (msg->start_epoch_hvals[j] == hval) {
+ if (j == 0)
+ has_start_epoch_log = 1;
+ goto next;
+ }
+ }
+ remove_epoch(i);
+ next:
+ ;
+ }
+ set_cluster_ctime(msg->epoch_ctime);
+ sys->epoch = msg->start_epoch;
+
+ if (!has_start_epoch_log) {
+ dprintf("write start epoch, %d, %d\n", sys->epoch, msg->nr_start_nodes);
+ ret = epoch_log_write(sys->epoch, (char *)msg->start_nodes,
+ msg->nr_start_nodes * sizeof(struct sheepdog_node_list_entry));
+ if (ret < 0)
+ eprintf("can't write epoch %u\n", sys->epoch);
+ }
+
+ nr_nodes = build_node_list(&sys->sd_node_list, entry);
+ uint64_t local_hval = epoch_hash_val(sys->epoch, msg->epoch_ctime, entry, nr_nodes);
+ uint64_t start_hval = msg->start_epoch_hvals[0];
+ dprintf("start epoch hval = %lu, local epoch hval = %lu\n", start_hval, local_hval);
+ if (start_hval != local_hval) {
+ dprintf("update epoch because current node_list is different from what it was, %d\n", sys->epoch + 1);
+ ret = epoch_log_write(sys->epoch + 1, (char *)entry,
+ nr_nodes * sizeof(struct sheepdog_node_list_entry));
+ if (ret < 0)
+ eprintf("can't write epoch %u\n", sys->epoch + 1);
+
+ sys->epoch++;
+ }
+
+ sys->is_object_updated = 0;
+ sys->status = SD_STATUS_OK;
+
+ update_epoch_store(sys->epoch);
+ }
+
+ sys->status = msg->cluster_status;
+
+ return;
}
static void vdi_op(struct vdi_op_message *msg)
@@ -944,14 +1028,17 @@ static void __sd_confch(struct work *work, int idx)
list_del(&node->list);
free(node);
- nr = build_node_list(&sys->sd_node_list, e);
- epoch_log_write(sys->epoch + 1, (char *)e,
- nr * sizeof(struct sheepdog_node_list_entry));
+ if (sys->status == SD_STATUS_OK) {
+ nr = build_node_list(&sys->sd_node_list, e);
+ dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
+ epoch_log_write(sys->epoch + 1, (char *)e,
+ nr * sizeof(struct sheepdog_node_list_entry));
- sys->epoch++;
- sys->is_object_updated = 0;
+ sys->epoch++;
+ sys->is_object_updated = 0;
- update_epoch_store(sys->epoch);
+ update_epoch_store(sys->epoch);
+ }
}
}
diff --git a/collie/store.c b/collie/store.c
index 59f5476..021380c 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -1068,6 +1068,26 @@ int epoch_log_read(uint32_t epoch, char *buf, int len)
return len;
}
+/*
+ * Scan the epoch_path directory and return the largest epoch number found
+ * among its entry names.
+ *
+ * Only names that consist entirely of a decimal number fitting in a
+ * non-negative int are considered; ".", ".." and any other entry are
+ * ignored, so a stray file can neither be mistaken for an epoch nor turn
+ * the result into something that looks like the error value.
+ *
+ * Returns the largest epoch (0 if none is found), or -1 if epoch_path
+ * cannot be opened.
+ */
+int get_latest_epoch(void)
+{
+	DIR *dir;
+	struct dirent *d;
+	int epoch = 0;
+
+	dir = opendir(epoch_path);
+	if (!dir)
+		return -1;
+
+	while ((d = readdir(dir))) {
+		char *end;
+		unsigned long e;
+
+		/* strtoul() would accept leading blanks and signs; we do not */
+		if (d->d_name[0] < '0' || d->d_name[0] > '9')
+			continue;
+
+		e = strtoul(d->d_name, &end, 10);
+		/* reject trailing garbage and values the int result cannot hold */
+		if (*end != '\0' || e > INT32_MAX)
+			continue;
+
+		if ((int)e > epoch)
+			epoch = (int)e;
+	}
+	closedir(dir);
+
+	return epoch;
+}
+
+
int set_epoch_updated(uint32_t epoch, uint32_t val)
{
int fd, ret;
@@ -1108,6 +1128,71 @@ uint32_t is_epoch_updated(uint32_t epoch)
return val;
}
+/*
+ * Remove the directory 'dir_path' together with everything below it.
+ *
+ * Returns 0 on success or a negative errno value on failure, so that the
+ * caller can compare the result with e.g. -ENOENT and hand -ret to
+ * strerror().  Removal stops at the first entry that cannot be deleted.
+ */
+static int rmdir_r(char *dir_path)
+{
+	int ret;
+	struct stat s;
+	DIR *dir;
+	struct dirent *d;
+	char path[PATH_MAX];
+
+	dir = opendir(dir_path);
+	if (!dir) {
+		/* capture errno before logging, which may overwrite it */
+		ret = -errno;
+		if (ret != -ENOENT)
+			eprintf("failed, %s, %d\n", dir_path, -ret);
+		return ret;
+	}
+
+	while ((d = readdir(dir))) {
+		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+			continue;
+
+		ret = snprintf(path, sizeof(path), "%s/%s", dir_path, d->d_name);
+		if (ret < 0 || (size_t)ret >= sizeof(path)) {
+			/* never stat/unlink a truncated, i.e. different, path */
+			eprintf("path too long, %s/%s\n", dir_path, d->d_name);
+			ret = -ENAMETOOLONG;
+			goto out;
+		}
+
+		/* lstat(): a symbolic link is unlinked itself, never followed */
+		if (lstat(path, &s) < 0) {
+			ret = -errno;
+			eprintf("cannot remove directory %s\n", path);
+			goto out;
+		}
+
+		if (S_ISDIR(s.st_mode))
+			ret = rmdir_r(path);
+		else if (unlink(path) < 0)
+			ret = -errno;
+		else
+			ret = 0;
+
+		if (ret != 0) {
+			eprintf("failed, %s, %d, %d\n", path, S_ISDIR(s.st_mode), -ret);
+			goto out;
+		}
+	}
+
+	ret = rmdir(dir_path) < 0 ? -errno : 0;
+out:
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Remove what is stored for 'epoch': the file named by epoch_path followed
+ * by the zero-padded epoch number, and the directory named by obj_path
+ * followed by the same number.  A file or directory that does not exist is
+ * not treated as an error.
+ *
+ * Returns 0 on success and SD_RES_EIO if either removal fails.
+ */
+int remove_epoch(int epoch)
+{
+	int ret;
+	char path[PATH_MAX];
+
+	dprintf("remove epoch %d\n", epoch);
+	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
+	/* unlink() returns -1 and reports the cause in errno, not in ret */
+	ret = unlink(path);
+	if (ret && errno != ENOENT) {
+		eprintf("failed to remove %s, %s\n", path, strerror(errno));
+		return SD_RES_EIO;
+	}
+
+	snprintf(path, sizeof(path), "%s%08u", obj_path, epoch);
+	/* rmdir_r() returns -errno when the directory cannot be opened */
+	ret = rmdir_r(path);
+	if (ret && ret != -ENOENT) {
+		eprintf("failed to remove %s, %s\n", path, strerror(-ret));
+		return SD_RES_EIO;
+	}
+	return 0;
+}
+
uint64_t epoch_hash_val(uint32_t epoch, uint64_t ctime,
struct sheepdog_node_list_entry *entries, int len)
{
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index c28dcba..e835f68 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -276,6 +276,14 @@ struct sheepdog_node_list_entry {
uint16_t pad;
};
+struct epoch_log { /* per-epoch log record; NOTE(review): its users are not visible in this patch */
+ uint64_t ctime; /* presumably the cluster creation time -- TODO confirm */
+ uint64_t hval; /* NOTE(review): looks like the epoch_hash_val() result for this epoch -- confirm */
+ uint32_t epoch; /* epoch number */
+ uint32_t nr_nodes; /* presumably the number of valid entries in nodes[] -- TODO confirm */
+ struct sheepdog_node_list_entry nodes[SD_MAX_NODES]; /* node entries, room for SD_MAX_NODES */
+};
+
/*
* 64 bit FNV-1a non-zero initial basis
*/
--
1.5.6.5
More information about the sheepdog
mailing list