The master node distributes cluster status information to all nodes. When sheepdog can start, the master node also distributes the epoch from which the nodes will start. Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> --- collie/collie.h | 2 + collie/group.c | 139 +++++++++++++++++++++++++++++++++++++--------- collie/store.c | 85 ++++++++++++++++++++++++++++ include/sheepdog_proto.h | 8 +++ 4 files changed, 208 insertions(+), 26 deletions(-) diff --git a/collie/collie.h b/collie/collie.h index f99466b..f8e0b18 100644 --- a/collie/collie.h +++ b/collie/collie.h @@ -117,8 +117,10 @@ extern struct work_queue *dobj_queue; int epoch_log_write(uint32_t epoch, char *buf, int len); int epoch_log_read(uint32_t epoch, char *buf, int len); +int get_latest_epoch(void); int set_epoch_updated(uint32_t epoch, uint32_t val); uint32_t is_epoch_updated(uint32_t epoch); +int remove_epoch(int epoch); uint64_t epoch_hash_val(uint32_t epoch, uint64_t ctime, struct sheepdog_node_list_entry *entries, int len); int set_cluster_ctime(uint64_t ctime); diff --git a/collie/group.c b/collie/group.c index a74ba39..01b51f8 100644 --- a/collie/group.c +++ b/collie/group.c @@ -62,6 +62,13 @@ struct join_message { uint32_t nr_sobjs; uint32_t cluster_status; + uint64_t epoch_ctime; + uint32_t start_epoch; + uint16_t nr_start_epoch_logs; + uint16_t nr_start_nodes; + uint64_t start_epoch_hvals[SD_MAX_NODES]; + struct sheepdog_node_list_entry start_nodes[SD_MAX_NODES]; + struct { uint32_t nodeid; uint32_t pid; @@ -345,11 +352,6 @@ static struct epoch_tree *find_start_epoch(struct tree_vertex *root) return tree; } -struct start_epoch_info { - int nr_logs; - struct epoch_log *logs; -}; - static int __check_enough_epochs(struct tree_vertex *vertex, int depth, void *data) { struct epoch_tree *tree; @@ -628,6 +630,23 @@ static void join(struct join_message *msg) int nr_nodes = build_node_list(&sys->cpg_node_list, entries); msg->cluster_status = get_cluster_status(entries, nr_nodes, &start_point);
dprintf("nr_nodes %d, status %d\n", nr_nodes, msg->cluster_status); + + if (msg->cluster_status == SD_STATUS_OK) { + struct tree_vertex *v; + msg->epoch_ctime = start_point->ctime; + msg->start_epoch = start_point->epoch; + msg->nr_start_nodes = start_point->nr_nodes; + memcpy(msg->start_nodes, start_point->nodes, + sizeof(start_point->nodes[0]) * start_point->nr_nodes); + + v = &start_point->vertex; + msg->nr_start_epoch_logs = 0; + while (v->id) { + msg->start_epoch_hvals[msg->nr_start_epoch_logs] = v->id; + msg->nr_start_epoch_logs++; + v = tree_parent(v); + } + } out: return; } @@ -667,31 +686,96 @@ static void update_cluster_info(struct join_message *msg) sys->epoch = msg->epoch; sys->synchronized = 1; - nr_nodes = build_node_list(&sys->sd_node_list, entry); + eprintf("system status = %d\n", msg->cluster_status); + if (sys->status == SD_STATUS_OK) { + nr_nodes = build_node_list(&sys->sd_node_list, entry); - ret = epoch_log_write(sys->epoch, (char *)entry, - nr_nodes * sizeof(struct sheepdog_node_list_entry)); - if (ret < 0) - eprintf("can't write epoch %u\n", sys->epoch); + dprintf("update epoch, %d, %d\n", sys->epoch, nr_nodes); + ret = epoch_log_write(sys->epoch, (char *)entry, + nr_nodes * sizeof(struct sheepdog_node_list_entry)); + if (ret < 0) + eprintf("can't write epoch %u\n", sys->epoch); + } - /* we are ready for object operations */ - update_epoch_store(sys->epoch); out: add_node(&sys->sd_node_list, msg->nodeid, msg->pid, &msg->header.from); - nr_nodes = build_node_list(&sys->sd_node_list, entry); + if (sys->status == SD_STATUS_OK) { + nr_nodes = build_node_list(&sys->sd_node_list, entry); - ret = epoch_log_write(sys->epoch + 1, (char *)entry, - nr_nodes * sizeof(struct sheepdog_node_list_entry)); - if (ret < 0) - eprintf("can't write epoch %u\n", sys->epoch + 1); + dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr_nodes); + ret = epoch_log_write(sys->epoch + 1, (char *)entry, + nr_nodes * sizeof(struct sheepdog_node_list_entry)); + if (ret < 0) + 
eprintf("can't write epoch %u\n", sys->epoch + 1); - sys->epoch++; - sys->is_object_updated = 0; + sys->epoch++; + sys->is_object_updated = 0; - update_epoch_store(sys->epoch); + update_epoch_store(sys->epoch); + } print_node_list(&sys->sd_node_list); + + if (sys->status & SD_STATUS_STARTUP_MASK && msg->cluster_status == SD_STATUS_OK) { + int i, j; + int latest_epoch = get_latest_epoch(); + int size; + uint64_t hval; + int has_start_epoch_log = 0; + + dprintf("latest_epoch = %d\n", latest_epoch); + for (i = 1; i <= latest_epoch; i++) { + size = epoch_log_read(i, (char *)entry, sizeof(entry)); + if (size <= 0) + continue; + hval = epoch_hash_val(i, get_cluster_ctime(), + entry, size / sizeof(*entry)); + for (j = 0; j < msg->nr_start_epoch_logs; j++) { + if (msg->start_epoch_hvals[j] == hval) { + if (j == 0) + has_start_epoch_log = 1; + goto next; + } + } + remove_epoch(i); + next: + ; + } + set_cluster_ctime(msg->epoch_ctime); + sys->epoch = msg->start_epoch; + + if (!has_start_epoch_log) { + dprintf("write start epoch, %d, %d\n", sys->epoch, msg->nr_start_nodes); + ret = epoch_log_write(sys->epoch, (char *)msg->start_nodes, + msg->nr_start_nodes * sizeof(struct sheepdog_node_list_entry)); + if (ret < 0) + eprintf("can't write epoch %u\n", sys->epoch); + } + + nr_nodes = build_node_list(&sys->sd_node_list, entry); + uint64_t local_hval = epoch_hash_val(sys->epoch, msg->epoch_ctime, entry, nr_nodes); + uint64_t start_hval = msg->start_epoch_hvals[0]; + dprintf("start epoch hval = %lu, local epoch hval = %lu\n", start_hval, local_hval); + if (start_hval != local_hval) { + dprintf("update epoch because current node_list is different from what it was, %d\n", sys->epoch + 1); + ret = epoch_log_write(sys->epoch + 1, (char *)entry, + nr_nodes * sizeof(struct sheepdog_node_list_entry)); + if (ret < 0) + eprintf("can't write epoch %u\n", sys->epoch + 1); + + sys->epoch++; + } + + sys->is_object_updated = 0; + sys->status = SD_STATUS_OK; + + update_epoch_store(sys->epoch); + 
}
+
+	sys->status = msg->cluster_status;
+
+	return;
 }
 
 static void vdi_op(struct vdi_op_message *msg)
@@ -944,14 +1028,17 @@ static void __sd_confch(struct work *work, int idx)
 			list_del(&node->list);
 			free(node);
 
-			nr = build_node_list(&sys->sd_node_list, e);
-			epoch_log_write(sys->epoch + 1, (char *)e,
-					nr * sizeof(struct sheepdog_node_list_entry));
+			if (sys->status == SD_STATUS_OK) {
+				nr = build_node_list(&sys->sd_node_list, e);
+				dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
+				epoch_log_write(sys->epoch + 1, (char *)e,
+						nr * sizeof(struct sheepdog_node_list_entry));
 
-			sys->epoch++;
-			sys->is_object_updated = 0;
+				sys->epoch++;
+				sys->is_object_updated = 0;
 
-			update_epoch_store(sys->epoch);
+				update_epoch_store(sys->epoch);
+			}
 		}
 	}
 
diff --git a/collie/store.c b/collie/store.c
index 59f5476..021380c 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -1068,6 +1068,30 @@ int epoch_log_read(uint32_t epoch, char *buf, int len)
 	return len;
 }
 
+/*
+ * Return the largest number parsed with atoi() from the entry names of
+ * the epoch directory, or -1 if the directory cannot be opened.
+ */
+int get_latest_epoch(void)
+{
+	DIR *dir;
+	struct dirent *d;
+	uint32_t e, epoch = 0;
+
+	dir = opendir(epoch_path);
+	if (!dir)
+		return -1;
+
+	while ((d = readdir(dir))) {
+		e = atoi(d->d_name);
+		if (e > epoch)
+			epoch = e;
+	}
+	closedir(dir);
+
+	return epoch;
+}
+
 int set_epoch_updated(uint32_t epoch, uint32_t val)
 {
 	int fd, ret;
@@ -1108,6 +1132,92 @@ uint32_t is_epoch_updated(uint32_t epoch)
 	return val;
 }
 
+/*
+ * Remove the directory dir_path together with everything below it.
+ *
+ * Returns 0 on success or a negative errno value on failure, so that the
+ * caller can compare the result with -ENOENT and pass -ret to strerror().
+ */
+static int rmdir_r(char *dir_path)
+{
+	int ret;
+	struct stat s;
+	DIR *dir;
+	struct dirent *d;
+	char path[PATH_MAX];
+
+	dir = opendir(dir_path);
+	if (!dir) {
+		/* capture errno before logging; a later call may change it */
+		ret = -errno;
+		if (ret != -ENOENT)
+			eprintf("failed, %s, %d\n", dir_path, -ret);
+		return ret;
+	}
+
+	while ((d = readdir(dir))) {
+		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+			continue;
+
+		snprintf(path, sizeof(path), "%s/%s", dir_path, d->d_name);
+		if (stat(path, &s) < 0) {
+			ret = -errno;
+			eprintf("cannot remove directory %s\n", path);
+			goto out;
+		}
+
+		/* unlink() only returns -1; the reason is in errno */
+		if (S_ISDIR(s.st_mode))
+			ret = rmdir_r(path);
+		else if (unlink(path) < 0)
+			ret = -errno;
+		else
+			ret = 0;
+
+		if (ret != 0) {
+			eprintf("failed, %s, %d, %d\n", path,
+				S_ISDIR(s.st_mode), -ret);
+			goto out;
+		}
+	}
+
+	ret = 0;
+	if (rmdir(dir_path) < 0)
+		ret = -errno;
+out:
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Remove the epoch log file and the object directory of the given epoch.
+ * A file or directory that does not exist is not treated as an error.
+ *
+ * Returns 0 on success and SD_RES_EIO otherwise.
+ */
+int remove_epoch(int epoch)
+{
+	int ret;
+	char path[PATH_MAX];
+
+	dprintf("remove epoch %d\n", epoch);
+	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
+	/* unlink() reports failure as -1 with the cause in errno */
+	ret = unlink(path) < 0 ? -errno : 0;
+	if (ret && ret != -ENOENT) {
+		eprintf("failed to remove %s, %s\n", path, strerror(-ret));
+		return SD_RES_EIO;
+	}
+
+	snprintf(path, sizeof(path), "%s%08u", obj_path, epoch);
+	ret = rmdir_r(path);
+	if (ret && ret != -ENOENT) {
+		eprintf("failed to remove %s, %s\n", path, strerror(-ret));
+		return SD_RES_EIO;
+	}
+	return 0;
+}
+
 uint64_t epoch_hash_val(uint32_t epoch, uint64_t ctime,
 			struct sheepdog_node_list_entry *entries, int len)
 {
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index c28dcba..e835f68 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -276,6 +276,14 @@ struct sheepdog_node_list_entry {
 	uint16_t	pad;
 };
 
+struct epoch_log {
+	uint64_t ctime;
+	uint64_t hval;
+	uint32_t epoch;
+	uint32_t nr_nodes;
+	struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
+};
+
 /*
  * 64 bit FNV-1a non-zero initial basis
  */
-- 
1.5.6.5 |