This patch adds preliminary support for vdi deletion. Usage: shepherd delete vdiname [-i snapshot_id] The '-i' argument is required when you want to delete a snapshot. Note: Currently, there is a restriction on this feature. Data objects are not reclaimed until all vdis with the same name are deleted. For example: name id size used shared creation time object id -------------------------------------------------------------------- windows 0 4 GB 2 GB 0 MB 2010-01-09 15:01 80000 linux 0 4 GB 1 GB 1 GB 2010-01-09 15:02 c0000 s linux 1 4 GB 2 GB 0 MB 2010-01-09 15:01 40000 To free space used by the linux vdi, you must run both of the following commands: $ shepherd delete linux $ shepherd delete linux -i 1 Running only one of them does not reclaim the data objects, even though the vdi disappears from the output of the vdi listing. If the vdi is cloned from another vdi, you also need to delete all the base vdis to reclaim data objects. Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp> --- collie/collie.h | 3 + collie/group.c | 12 ++ collie/net.c | 1 + collie/store.c | 361 ++++++++++++++++++++++++++++++++++++++++++++++++++- collie/vdi.c | 25 ++++- include/meta.h | 1 + include/net.h | 3 + lib/net.c | 46 +++++++ shepherd/shepherd.c | 51 +++++++ 9 files changed, 497 insertions(+), 6 deletions(-) diff --git a/collie/collie.h b/collie/collie.h index 5cd2383..9ccc1d8 100644 --- a/collie/collie.h +++ b/collie/collie.h @@ -83,6 +83,8 @@ int add_vdi(char *buf, int len, uint64_t size, uint64_t *added_oid, uint64_t base_oid, uint32_t tag, int copies, uint16_t flags); +int del_vdi(char *name, uint64_t oid); + int lookup_vdi(char *filename, uint64_t * oid, uint32_t tag, int do_lock, int *current); @@ -118,5 +120,6 @@ int set_cluster_ctime(uint64_t ctime); uint64_t get_cluster_ctime(void); int start_recovery(uint32_t epoch, int add); +int start_deletion(uint64_t oid); #endif diff --git a/collie/group.c b/collie/group.c index a49c1be..fadb6ee 100644 --- a/collie/group.c +++ b/collie/group.c @@ -514,6 +514,16 @@ 
static void vdi_op(struct vdi_op_message *msg) ret = add_vdi(data, hdr->data_length, hdr->vdi_size, &oid, hdr->base_oid, hdr->tag, hdr->copies, hdr->flags); break; + case SD_OP_DEL_VDI: + ret = lookup_vdi(data, &oid, hdr->tag, 1, &is_current); + if (ret != SD_RES_SUCCESS) + break; + if (lookup_vm(&sys->vm_list, (char *)data)) { + ret = SD_RES_VDI_LOCKED; + break; + } + ret = del_vdi(data, oid); + break; case SD_OP_LOCK_VDI: case SD_OP_GET_VDI_INFO: ret = lookup_vdi(data, &oid, hdr->tag, 1, &is_current); @@ -557,6 +567,8 @@ static void vdi_op_done(struct vdi_op_message *msg) switch (hdr->opcode) { case SD_OP_NEW_VDI: break; + case SD_OP_DEL_VDI: + break; case SD_OP_LOCK_VDI: if (lookup_vm(&sys->vm_list, (char *)data)) { ret = SD_RES_VDI_LOCKED; diff --git a/collie/net.c b/collie/net.c index 04f9547..d226658 100644 --- a/collie/net.c +++ b/collie/net.c @@ -93,6 +93,7 @@ static void queue_request(struct request *req) break; case SD_OP_SO: case SD_OP_SO_NEW_VDI: + case SD_OP_SO_DEL_VDI: case SD_OP_SO_LOOKUP_VDI: case SD_OP_SO_READ_VDIS: case SD_OP_SO_STAT: diff --git a/collie/store.c b/collie/store.c index a36c30f..e23e6ad 100644 --- a/collie/store.c +++ b/collie/store.c @@ -24,6 +24,7 @@ #define ANAME_LAST_OID "user.sheepdog.last_oid" #define ANAME_CTIME "user.sheepdog.ctime" #define ANAME_COPIES "user.sheepdog.copies" +#define ANAME_FLAGS "user.sheepdog.flags" #define ANAME_OBJECT_UPDATED "user.sheepdog.object_updated" static char *vdi_path; @@ -473,10 +474,15 @@ static int store_queue_request_local(struct request *req, char *buf, uint32_t ep switch (opcode) { case SD_OP_REMOVE_OBJ: - snprintf(path, sizeof(path), "%s%" PRIx64, obj_path, oid); + snprintf(path, sizeof(path), "%s%08u/%016" PRIx64, obj_path, + epoch, oid); ret = unlink(path); - if (ret) - ret = 1; + if (ret) { + if (errno == ENOENT) + ret = SD_RES_NO_OBJ; + else + ret = SD_RES_EIO; + } break; case SD_OP_READ_OBJ: /* @@ -608,14 +614,16 @@ static int so_read_vdis(struct request *req) struct dirent *dent, 
**vdent; char *p; char vpath[1024]; + char path[1024]; struct sheepdog_vdi_info *sde = req->data; + uint32_t flags; dir = opendir(vdi_path); if (!dir) return SD_RES_NO_SUPER_OBJ; while ((dent = readdir(dir))) { - int i, n; + int i, n, fd, ret; if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) @@ -626,6 +634,22 @@ static int so_read_vdis(struct request *req) n = scandir(vpath, &vdent, filter, vdi_sort); for (i = 0; i < n; i++) { + snprintf(path, sizeof(path), "%s/%s", vpath, + vdent[i]->d_name); + + fd = open(path, O_RDONLY); + if (fd < 0) { + eprintf("%m\n"); + continue; + } + + ret = fgetxattr(fd, ANAME_FLAGS, &flags, sizeof(flags)); + if (ret == sizeof(flags)) { + if (flags & FLAG_DELETED) + continue; + } + + p = strchr(vdent[i]->d_name, '-'); dprintf("%s\n", vdent[i]->d_name); @@ -661,6 +685,85 @@ static int so_read_vdis(struct request *req) return SD_RES_SUCCESS; } +static int so_del_vdi(struct request *req) +{ + struct sd_so_req *so_req = (struct sd_so_req *)&req->rq; + int result = SD_RES_SUCCESS; + char path[PATH_MAX], fname[NAME_MAX]; + int fd, ret; + uint32_t flags; + DIR *dir; + struct dirent *dent; + + snprintf(path, sizeof(path), "%s%s", vdi_path, (char *)req->data); + + dir = opendir(path); + if (!dir) { + if (errno == ENOENT) + result = SD_RES_NO_VDI; + else { + eprintf("%m\n"); + result = SD_RES_EIO; + } + goto out; + } + + memset(fname, 0, sizeof(fname)); + while ((dent = readdir(dir))) { + uint64_t oid; + + if (!strcmp(dent->d_name, ".") || + !strcmp(dent->d_name, "..")) + continue; + + oid = strtoul(dent->d_name, NULL, 16); + if (oid == so_req->oid) { + strcpy(fname, dent->d_name); + break; + } + } + closedir(dir); + + if (fname[0] == '\0') { + result = SD_RES_NO_VDI; + goto out; + } + + snprintf(path, sizeof(path), "%s%s/%s", vdi_path, (char *)req->data, fname); + + fd = open(path, O_RDONLY); + if (fd < 0) { + if (errno == ENOENT) + result = SD_RES_NO_VDI; + else { + eprintf("%m\n"); + result = SD_RES_EIO; + } + goto out; + } + + ret 
= fgetxattr(fd, ANAME_FLAGS, &flags, sizeof(flags)); + if (ret == sizeof(flags)) { + if (flags & FLAG_DELETED) { + result = SD_RES_NO_VDI; + goto out; + } + } else + flags = 0; + + flags |= FLAG_DELETED; + ret = fsetxattr(fd, ANAME_FLAGS, &flags, sizeof(flags), 0); + if (ret) { + eprintf("%m\n"); + close(fd); + result = SD_RES_EIO; + goto out; + } + +out: + return result; +} + static int so_lookup_vdi(struct request *req) { struct sd_so_req *hdr = (struct sd_so_req *)&req->rq; @@ -963,6 +1066,9 @@ void so_queue_request(struct work *work, int idx) rsp->oid = last_oid; break; + case SD_OP_SO_DEL_VDI: + result = so_del_vdi(req); + break; case SD_OP_SO_LOOKUP_VDI: result = so_lookup_vdi(req); break; @@ -1507,6 +1613,253 @@ int start_recovery(uint32_t epoch, int add) return 0; } +struct deletion_work { + uint32_t done; + + struct work work; + struct list_head dw_siblings; + + uint64_t oid; + + int count; + char *buf; +}; + +static LIST_HEAD(deletion_work_list); +static int deleting; + +static void delete_one(struct work *work, int idx) +{ + struct deletion_work *dw = container_of(work, struct deletion_work, work); + uint64_t vdi_oid = *(((uint64_t *)dw->buf) + dw->count - dw->done - 1); + struct sheepdog_node_list_entry entries[SD_MAX_NODES]; + int nr_nodes; + int ret, i; + char *buf = zero_block + idx * SD_DATA_OBJ_SIZE; + struct sheepdog_inode *inode = (struct sheepdog_inode *)buf; + + eprintf("%d %d, %16lx\n", dw->done, dw->count, vdi_oid); + + nr_nodes = build_node_list(&sys->sd_node_list, entries); + + ret = read_object(entries, nr_nodes, sys->epoch, + vdi_oid, (void *)inode, sizeof(*inode), 0, sys->nr_sobjs); + + if (ret != sizeof(*inode)) { + eprintf("cannot find vdi object\n"); + return; + } + + for (i = 0; i < MAX_DATA_OBJS; i++) { + if (!inode->data_oid[i]) + continue; + + remove_object(entries, nr_nodes, sys->epoch, + inode->data_oid[i], inode->nr_copies); + } + + if (remove_object(entries, nr_nodes, sys->epoch, vdi_oid, sys->nr_sobjs)) + eprintf("failed to 
remove vdi objects %lx\n", vdi_oid); +} + +static void __start_deletion(struct work *work, int idx); +static void __start_deletion_done(struct work *work, int idx); + +static void delete_one_done(struct work *work, int idx) +{ + struct deletion_work *dw = container_of(work, struct deletion_work, work); + + dw->done++; + if (dw->done < dw->count) { + queue_work(dobj_queue, &dw->work); + return; + } + + deleting--; + + list_del(&dw->dw_siblings); + + free(dw->buf); + free(dw); + + if (!list_empty(&deletion_work_list)) { + dw = list_first_entry(&deletion_work_list, + struct deletion_work, dw_siblings); + + deleting++; + queue_work(dobj_queue, &dw->work); + } +} + +static int fill_vdi_list(struct deletion_work *dw, + struct sheepdog_node_list_entry *entries, + int nr_entries, uint64_t root_oid, + struct sheepdog_vdi_info *sde, int nr_sde, + char *buf) +{ + int ret, i; + struct sheepdog_inode *inode = (struct sheepdog_inode *)buf; + int done = dw->count; + uint64_t oid; + + ((uint64_t *)dw->buf)[dw->count++] = root_oid; +again: + oid = ((uint64_t *)dw->buf)[done++]; + ret = read_object(entries, nr_entries, sys->epoch, + oid, (void *)inode, sizeof(*inode), 0, nr_entries); + + if (ret != sizeof(*inode)) { + eprintf("cannot find vdi object\n"); + return 0; + } + + for (i = 0; i < nr_sde; i++) { + if (sde[i].oid == oid) { + if (!(sde[i].flags & FLAG_DELETED)) + return 1; + break; + } + } + + for (i = 0; i < ARRAY_SIZE(inode->child_oid); i++) { + if (!inode->child_oid[i]) + continue; + + ((uint64_t *)dw->buf)[dw->count++] = inode->child_oid[i]; + } + + if (((uint64_t *)dw->buf)[done]) + goto again; + + return 0; +} + +static uint64_t get_vdi_root(struct sheepdog_node_list_entry *entries, + int nr_entries, uint64_t oid, char *buf) +{ + int ret; + struct sheepdog_inode *inode = (struct sheepdog_inode *)buf; + +next: + ret = read_object(entries, nr_entries, sys->epoch, oid, + (void *)inode, sizeof(*inode), 0, nr_entries); + + if (ret != sizeof(*inode)) { + eprintf("cannot find 
vdi object\n"); + return 0; + } + + if (!inode->parent_oid) + return oid; + + oid = inode->parent_oid; + + goto next; +} + +static void __start_deletion(struct work *work, int idx) +{ + struct deletion_work *dw = container_of(work, struct deletion_work, work); + struct sheepdog_node_list_entry entries[SD_MAX_NODES]; + int nr_nodes; + int ret = 0; + struct sd_so_req hdr; + struct sd_so_rsp *rsp = (struct sd_so_rsp *)&hdr; + uint64_t root_oid; + char *buf = zero_block + idx * SD_DATA_OBJ_SIZE; + struct sheepdog_vdi_info *sde; + int nr_sde; + size_t sde_size; + + sde = (struct sheepdog_vdi_info *)(buf + sizeof(struct sheepdog_inode)); + sde_size = SD_DATA_OBJ_SIZE - sizeof(struct sheepdog_inode); + + nr_nodes = build_node_list(&sys->sd_node_list, entries); + + memset(&hdr, 0, sizeof(hdr)); + + dw->buf = zalloc(1 << 20); /* FIXME: handle larger buffer */ + if (!dw->buf) + goto fail; + + hdr.opcode = SD_OP_SO_READ_VDIS; + + ret = exec_reqs(entries, nr_nodes, sys->epoch, SD_DIR_OID, + (struct sd_req *)&hdr, (char *)sde, 0, sde_size, + nr_nodes, 1); + + if (ret < 0) + goto fail; + + nr_sde = rsp->data_length / sizeof(*sde); + + root_oid = get_vdi_root(entries, nr_nodes, dw->oid, buf); + if (!root_oid) + goto fail; + + ret = fill_vdi_list(dw, entries, nr_nodes, root_oid, sde, nr_sde, buf); + if (ret) + goto fail; + + return; + +fail: + dw->count = 0; + return; +} + +static void __start_deletion_done(struct work *work, int idx) +{ + struct deletion_work *dw = container_of(work, struct deletion_work, work); + + if (dw->count) { + dw->work.fn = delete_one; + dw->work.done = delete_one_done; + + queue_work(dobj_queue, &dw->work); + return; + } + + deleting--; + + list_del(&dw->dw_siblings); + + free(dw->buf); + free(dw); + + if (!list_empty(&deletion_work_list)) { + dw = list_first_entry(&deletion_work_list, + struct deletion_work, dw_siblings); + + deleting++; + queue_work(dobj_queue, &dw->work); + } +} + +int start_deletion(uint64_t oid) +{ + struct deletion_work *dw; + + dw 
= zalloc(sizeof(struct deletion_work)); + if (!dw) + return -1; + + dw->count = 0; + dw->oid = oid; + + dw->work.fn = __start_deletion; + dw->work.done = __start_deletion_done; + + list_add_tail(&dw->dw_siblings, &deletion_work_list); + + if (!deleting) { + deleting++; + queue_work(dobj_queue, &dw->work); + } + + return 0; +} + static int init_path(char *d, int *new) { int ret, retry = 0; diff --git a/collie/vdi.c b/collie/vdi.c index 5904488..35ed25b 100644 --- a/collie/vdi.c +++ b/collie/vdi.c @@ -133,9 +133,30 @@ int add_vdi(char *name, int len, uint64_t size, return ret; } -int del_vdi(char *name, int len) +int del_vdi(char *name, uint64_t oid) { - return 0; + struct sheepdog_node_list_entry entries[SD_MAX_NODES]; + int nr_nodes; + int ret = 0; + struct sd_so_req hdr; + struct sd_so_rsp *rsp = (struct sd_so_rsp *)&hdr;; + + nr_nodes = build_node_list(&sys->sd_node_list, entries); + + memset(&hdr, 0, sizeof(hdr)); + hdr.opcode = SD_OP_SO_DEL_VDI; + hdr.oid = oid; + ret = exec_reqs(entries, nr_nodes, sys->epoch, SD_DIR_OID, + (struct sd_req *)&hdr, name, strlen(name) + 1, 0, + nr_nodes, 1); + if (ret < 0 || rsp->result != SD_RES_SUCCESS) + return rsp->result; + + ret = start_deletion(oid); + if (ret) + return SD_RES_NO_MEM; + + return SD_RES_SUCCESS; } int lookup_vdi(char *filename, uint64_t * oid, uint32_t tag, int do_lock, diff --git a/include/meta.h b/include/meta.h index 67d2b11..5815c51 100644 --- a/include/meta.h +++ b/include/meta.h @@ -50,6 +50,7 @@ static inline int is_data_obj(uint64_t oid) #define SHEEPDOG_SUPER_OBJ_SIZE (UINT64_C(1) << 12) #define FLAG_CURRENT 1 +#define FLAG_DELETED 2 struct sheepdog_vdi_info { uint64_t oid; diff --git a/include/net.h b/include/net.h index f456eb0..618d202 100644 --- a/include/net.h +++ b/include/net.h @@ -43,6 +43,9 @@ int read_object(struct sheepdog_node_list_entry *e, int nodes, uint32_t node_version, uint64_t oid, char *data, unsigned int datalen, uint64_t offset, int nr); +int remove_object(struct 
sheepdog_node_list_entry *e, + int nodes, uint32_t node_version, + uint64_t oid, int nr); int exec_reqs(struct sheepdog_node_list_entry *e, int nodes, uint32_t node_version, uint64_t oid, struct sd_req *hdr, diff --git a/lib/net.c b/lib/net.c index c85ee2d..d36069a 100644 --- a/lib/net.c +++ b/lib/net.c @@ -436,6 +436,52 @@ int read_object(struct sheepdog_node_list_entry *e, return -1; } +int remove_object(struct sheepdog_node_list_entry *e, + int nodes, uint32_t node_version, + uint64_t oid, int nr) +{ + char name[128]; + struct sd_obj_req hdr; + struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; + int i = 0, n, fd, ret; + + if (nr > nodes) + nr = nodes; + + for (i = 0; i < nr; i++) { + unsigned wlen = 0, rlen = 0; + + n = obj_to_sheep(e, nodes, oid, i); + + addr_to_str(name, sizeof(name), e[n].addr, 0); + + fd = connect_to(name, e[n].port); + if (fd < 0) { + rsp->result = SD_RES_EIO; + return -1; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.epoch = node_version; + hdr.opcode = SD_OP_REMOVE_OBJ; + hdr.oid = oid; + + hdr.flags = 0; + hdr.data_length = rlen; + + ret = exec_req(fd, (struct sd_req *)&hdr, NULL, &wlen, &rlen); + close(fd); + + if (ret) + return -1; + } + + if (rsp->result != SD_RES_SUCCESS) + return -1; + + return 0; +} + /* TODO: clean up with the above functions */ int exec_reqs(struct sheepdog_node_list_entry *e, int nodes, uint32_t node_version, uint64_t oid, struct sd_req *hdr, diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c index 5d89710..a4f3210 100644 --- a/shepherd/shepherd.c +++ b/shepherd/shepherd.c @@ -79,6 +79,7 @@ static void usage(int status) \n\ Command syntax:\n\ mkfs [--copies=N]\n\ + delete [-i snapshot_id] vdiname tag\n\ info -t (vdi|dog|sheep|obj|cluster) [-f (list|tree|graph)] [-H (on|off)] [-R (on|off)] [-i N] [-e N] [vdiname]\n\ debug -o node_version\n\ shutdown\n\ @@ -257,6 +258,54 @@ static int mkfs(int copies) return 0; } +static int delete(char *vdiname, uint32_t id) +{ + int fd, ret; + struct sd_vdi_req hdr; + struct 
sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr; + unsigned rlen, wlen; + + fd = connect_to("localhost", sdport); + if (fd < 0) + return -1; + + memset(&hdr, 0, sizeof(hdr)); + + rlen = 0; + wlen = strlen(vdiname) + 1; + + hdr.opcode = SD_OP_DEL_VDI; + if (id != ~0) + hdr.tag = id; + hdr.epoch = node_list_version; + hdr.flags = SD_FLAG_CMD_WRITE; + hdr.data_length = wlen; + + ret = exec_req(fd, (struct sd_req *)&hdr, vdiname, &wlen, &rlen); + close(fd); + + if (ret != SD_RES_SUCCESS) { + fprintf(stderr, "failed to connect the dog\n"); + return ret; + } + + if (rsp->result != SD_RES_SUCCESS) { + switch (rsp->result) { + case SD_RES_VDI_LOCKED: + fprintf(stderr, "the vdi is locked\n"); + break; + case SD_RES_NO_VDI: + fprintf(stderr, "no such vdi\n"); + break; + default: + fprintf(stderr, "error, %d\n", rsp->result); + break; + } + } + + return 0; +} + static int debug(char *op, char *arg) { struct sd_req hdr; @@ -1213,6 +1262,8 @@ int main(int argc, char **argv) info(type, format, name, highlight, real_time, index); } else if (!strcasecmp(command, "mkfs")) ret = mkfs(copies); + else if (!strcasecmp(command, "delete")) + ret = delete(argv[optind], index); else if (!strcasecmp(command, "debug")) ret = debug(op, argv[optind]); else if (!strcasecmp(command, "shutdown")) -- 1.5.6.5 |