On Fri, 26 Mar 2010 10:17:19 +0900 FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp> wrote: > We use the super object to manage the list of the existing VDIs. > > Unlike data objects, the super object is a directory and the directory > is replicated on multiple nodes. I concluded that the super object > recovery code is too tricky and complicated. > > So this patchset removes the super object. We manage the list of the > existing VDIs like p2p applications. At startup, the nodes build the > list of VDIs. > > TODO: restart support Done. This can be applied on top of the patchset. = From: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp> Subject: [PATCH] add reboot support without the super object Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp> --- collie/group.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++---- collie/net.c | 6 ++++- collie/store.c | 52 +++++++++++++++++++++++++++++++++++----- include/meta.h | 5 ++++ lib/net.c | 5 +++- shepherd/shepherd.c | 2 + 6 files changed, 120 insertions(+), 14 deletions(-) diff --git a/collie/group.c b/collie/group.c index 836de83..2067870 100644 --- a/collie/group.c +++ b/collie/group.c @@ -199,9 +199,6 @@ void cluster_queue_request(struct work *work, int idx) rsp->result = SD_RES_SUCCESS; break; - case SD_OP_READ_VDIS: - rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length); - break; default: /* forward request to group */ goto forward; @@ -431,6 +428,58 @@ static void join(struct join_message *msg) msg->cluster_status = sys->status; } +static void get_vdi_bitmap_from_all(void) +{ + struct sd_req hdr; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + int i, j, ret, nr_nodes, fd; + /* fixme: we need this until starting up. */ + static DECLARE_BITMAP(tmp_vdi_inuse, SD_NR_VDIS); + struct sheepdog_node_list_entry entry[SD_MAX_NODES]; + unsigned int rlen, wlen; + char host[128]; + + /* + * we don't need the proper order but this is the simplest + * way. 
+ */ + nr_nodes = build_node_list(&sys->sd_node_list, entry); + + for (i = 0; i < nr_nodes; i++) { + if (!memcmp(&sys->this_node, &entry[i], sizeof(sys->this_node))) + continue; + + addr_to_str(host, sizeof(host), entry[i].addr, 0); + + fd = connect_to(host, entry[i].port); + if (fd < 0) { + vprintf(SDOG_ERR "can't get the vdi bitmap %s, %m\n", host); + } + + vprintf(SDOG_ERR "get the vdi bitmap %d %s\n", i, host); + + memset(&hdr, 0, sizeof(hdr)); + hdr.opcode = SD_OP_READ_VDIS; + hdr.epoch = sys->epoch; + hdr.data_length = sizeof(tmp_vdi_inuse); + rlen = hdr.data_length; + wlen = 0; + + ret = exec_req(fd, &hdr, (char *)tmp_vdi_inuse, + &wlen, &rlen); + + close(fd); + + if (ret || rsp->result != SD_RES_SUCCESS) { + vprintf(SDOG_ERR "can't get the vdi bitmap %d %d\n", ret, + rsp->result); + } + + for (j = 0; j < ARRAY_SIZE(sys->vdi_inuse); j++) + sys->vdi_inuse[j] |= tmp_vdi_inuse[j]; + } +} + static void update_cluster_info(struct join_message *msg) { int i; @@ -498,9 +547,14 @@ out: if (sys->status == SD_STATUS_STARTUP && msg->cluster_status == SD_STATUS_OK) sys->epoch = get_latest_epoch(); - if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS) - sys->status = msg->cluster_status; + if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS) { + if (msg->cluster_status == SD_STATUS_OK) { + get_vdi_bitmap_from_all(); + set_global_nr_copies(sys->nr_sobjs); + } + sys->status = msg->cluster_status; + } return; } diff --git a/collie/net.c b/collie/net.c index 137790c..09b2452 100644 --- a/collie/net.c +++ b/collie/net.c @@ -54,6 +54,7 @@ static void queue_request(struct request *req) case SD_OP_MAKE_FS: case SD_OP_GET_NODE_LIST: case SD_OP_READ_EPOCH: + case SD_OP_READ_VDIS: break; default: if (sys->status == SD_STATUS_STARTUP) @@ -88,9 +89,12 @@ static void queue_request(struct request *req) case SD_OP_MAKE_FS: case SD_OP_SHUTDOWN: case SD_OP_STAT_CLUSTER: - case SD_OP_READ_VDIS: req->work.fn = cluster_queue_request; break; + case SD_OP_READ_VDIS: + rsp->result = 
read_vdis(req->data, hdr->data_length, &rsp->data_length); + req->done(req); + return; default: eprintf("unknown operation %d\n", hdr->opcode); rsp->result = SD_RES_SYSTEM_ERROR; diff --git a/collie/store.c b/collie/store.c index 429124c..5c870b6 100644 --- a/collie/store.c +++ b/collie/store.c @@ -439,9 +439,6 @@ static int store_queue_request_local(struct request *req, char *buf, uint32_t ep goto out; } - if (!is_data_obj(oid)) - break; - if (hdr->flags & SD_FLAG_CMD_COW) { dprintf("%" PRIu64 "\n", hdr->cow_oid); @@ -567,7 +564,7 @@ void store_queue_request(struct work *work, int idx) ret = store_queue_request_local(req, buf, epoch); out: if (ret != SD_RES_SUCCESS) { - dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %x\n", + dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %d\n", idx, opcode, oid, epoch, req_epoch, ret); rsp->result = ret; } @@ -1075,6 +1072,8 @@ static int init_path(char *d, int *new) { int ret, retry = 0; struct stat s; + + *new = 0; again: ret = stat(d, &s); if (ret) { @@ -1123,12 +1122,50 @@ static int init_obj_path(char *base_path) static int init_epoch_path(char *base_path) { - int new; + int new, ret; + uint32_t epoch; + DIR *dir; + char path[1024]; + struct dirent *dent; + uint64_t oid; epoch_path = zalloc(strlen(base_path) + strlen(EPOCH_PATH) + 1); sprintf(epoch_path, "%s" EPOCH_PATH, base_path); - return init_path(epoch_path, &new); + ret = init_path(epoch_path, &new); + if (new || ret) + return ret; + + epoch = get_latest_epoch(); + + snprintf(path, sizeof(path), "%s/%08u", obj_path, epoch); + + vprintf(SDOG_INFO "found the epoch dir, %s\n", path); + + dir = opendir(path); + if (!dir) { + vprintf(SDOG_ERR "failed to open the epoch dir, %m\n"); + return SD_RES_EIO; + } + + while ((dent = readdir(dir))) { + if (!strcmp(dent->d_name, ".") || + !strcmp(dent->d_name, "..")) + continue; + + oid = strtoull(dent->d_name, NULL, 16); + + if (is_data_obj(oid)) + continue; + + vprintf(SDOG_DEBUG "found the vdi obj, %" PRIx64 " %lu\n", + oid, 
oid_to_bit(oid)); + + set_bit(oid_to_bit(oid), sys->vdi_inuse); + } + closedir(dir); + + return 0; } static int init_mnt_path(char *base_path) @@ -1254,6 +1291,7 @@ static int global_nr_copies(uint32_t *copies, int set) } } else { if (ret != sizeof(*copies)) { + eprintf("use 'user_xattr' option?\n"); return SD_RES_SYSTEM_ERROR; } } @@ -1268,5 +1306,5 @@ int set_global_nr_copies(uint32_t copies) int get_global_nr_copies(uint32_t *copies) { - return global_nr_copies(copies, 1); + return global_nr_copies(copies, 0); } diff --git a/include/meta.h b/include/meta.h index 99fc38a..338f660 100644 --- a/include/meta.h +++ b/include/meta.h @@ -70,4 +70,9 @@ static inline uint64_t bit_to_oid(unsigned long nr) return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT; } +static inline unsigned long oid_to_bit(uint64_t oid) +{ + return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; +} + #endif diff --git a/lib/net.c b/lib/net.c index c85ee2d..ff261e5 100644 --- a/lib/net.c +++ b/lib/net.c @@ -412,8 +412,11 @@ int read_object(struct sheepdog_node_list_entry *e, addr_to_str(name, sizeof(name), e[n].addr, 0); fd = connect_to(name, e[n].port); - if (fd < 0) + if (fd < 0) { + printf("%s(%d): %s, %m\n", __func__, __LINE__, + name); return -1; + } memset(&hdr, 0, sizeof(hdr)); hdr.epoch = node_version; diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c index 0d7cecb..55a4fe0 100644 --- a/shepherd/shepherd.c +++ b/shepherd/shepherd.c @@ -449,6 +449,8 @@ int parse_vdi(vdi_parser_func_t func, void *data) if (ret == sizeof(i)) func(i.oid, i.name, i.snap_id, 0, &i, data); + else + printf("error %lu %" PRIx64 ", %d\n", nr, bit_to_oid(nr), ret); } -- 1.7.0 |