[Sheepdog] [PATCH v2] support vdi deletion

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Apr 8 21:10:00 CEST 2010


The previous patch assumed that sheepdog has the super object,
so it cannot be applied to the current git tree.
http://lists.wpkg.org/pipermail/sheepdog/2010-March/000276.html

This is against the next branch.

Thanks,

Kazutaka Morita

=
From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Subject: [PATCH v2] support vdi deletion

This patch adds preliminary support for vdi deletion.

Usage: shepherd delete vdiname [-i snapshot_id]

The '-i' argument is required when you want to delete a snapshot.

Note: Currently, there is a restriction on this feature. Data objects
are not reclaimed until all vdis with the same name are deleted.

i.e.
 name        id    size    used  shared    creation time  object id
--------------------------------------------------------------------
 windows      0    4 GB    2 GB    0 MB 2010-01-09 15:01      80000
 linux        0    4 GB    1 GB    1 GB 2010-01-09 15:02      c0000
s linux        1    4 GB    2 GB    0 MB 2010-01-09 15:01      40000

To free the space used by the linux vdi, you must run both of the following:
 $ shepherd delete linux
 $ shepherd delete linux -i 1

Running only one of them does not reclaim any objects, although the
vdi disappears from the output of the vdi listing.

If the vdi is cloned from another vdi, you also need to delete all of
its base vdis to reclaim the data objects.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/collie.h     |    3 +
 collie/group.c      |    9 ++
 collie/store.c      |   11 ++-
 collie/vdi.c        |  264 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/net.h       |    3 +
 lib/net.c           |   46 +++++++++
 shepherd/shepherd.c |   59 +++++++++++-
 7 files changed, 388 insertions(+), 7 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index 048cc7b..b183315 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -88,6 +88,8 @@ int add_vdi(char *data, int data_len, uint64_t size,
 	    uint64_t *new_oid, uint64_t base_oid, uint32_t copies,
 	    int is_snapshot);
 
+int del_vdi(char *data, int data_len, uint32_t snapid);
+
 int lookup_vdi(char *data, int data_len, uint64_t *oid, uint32_t snapid);
 
 int read_vdis(char *data, int len, unsigned int *rsp_len);
@@ -125,6 +127,7 @@ int set_cluster_ctime(uint64_t ctime);
 uint64_t get_cluster_ctime(void);
 
 int start_recovery(uint32_t epoch);
+int start_deletion(uint64_t oid);
 
 static inline int is_myself(struct sheepdog_node_list_entry *e)
 {
diff --git a/collie/group.c b/collie/group.c
index 56e0fe9..5dfd49e 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -579,6 +579,13 @@ static void vdi_op(struct vdi_op_message *msg)
 			      hdr->base_oid, hdr->copies,
 			      hdr->snapid);
 		break;
+	case SD_OP_DEL_VDI:
+		if (lookup_vm(&sys->vm_list, (char *)data)) {
+			ret = SD_RES_VDI_LOCKED;
+			break;
+		}
+		ret = del_vdi(data, hdr->data_length, hdr->snapid);
+		break;
 	case SD_OP_LOCK_VDI:
 	case SD_OP_GET_VDI_INFO:
 		ret = lookup_vdi(data, hdr->data_length, &oid, hdr->snapid);
@@ -625,6 +632,8 @@ static void vdi_op_done(struct vdi_op_message *msg)
 		set_bit(nr, sys->vdi_inuse);
 		break;
 	}
+	case SD_OP_DEL_VDI:
+		break;
 	case SD_OP_LOCK_VDI:
 		if (lookup_vm(&sys->vm_list, (char *)data)) {
 			ret = SD_RES_VDI_LOCKED;
diff --git a/collie/store.c b/collie/store.c
index 9de86e1..c62cb5b 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -493,10 +493,15 @@ static int store_queue_request_local(struct request *req, char *buf, uint32_t ep
 
 	switch (opcode) {
 	case SD_OP_REMOVE_OBJ:
-		snprintf(path, sizeof(path), "%s%" PRIx64, obj_path, oid);
+		snprintf(path, sizeof(path), "%s%08u/%016" PRIx64, obj_path,
+			 epoch, oid);
 		ret = unlink(path);
-		if (ret)
-			ret = 1;
+		if (ret) {
+			if (errno == ENOENT)
+				ret = SD_RES_NO_OBJ;
+			else
+				ret = SD_RES_EIO;
+		}
 		break;
 	case SD_OP_READ_OBJ:
 		/*
diff --git a/collie/vdi.c b/collie/vdi.c
index f60f772..ce37c8e 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -272,9 +272,49 @@ int add_vdi(char *data, int data_len, uint64_t size,
 	return ret;
 }
 
-int del_vdi(char *name, int len)
+int del_vdi(char *data, int data_len, uint32_t snapid)
 {
-	return 0;
+	char *name = data;
+	uint64_t oid;
+	uint32_t dummy0;
+	unsigned long dummy1, dummy2;
+	int ret;
+	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	int nr_nodes, nr_reqs;
+	static struct sheepdog_inode inode;
+
+	if (data_len != SD_MAX_VDI_LEN)
+		return SD_RES_INVALID_PARMS;
+
+	ret = do_lookup_vdi(name, strlen(name), &oid, snapid,
+			     &dummy0, &dummy1, &dummy2);
+	if (ret != SD_RES_SUCCESS)
+		return ret;
+
+	nr_nodes = build_node_list(&sys->sd_node_list, entries);
+	nr_reqs = sys->nr_sobjs;
+	if (nr_reqs > nr_nodes)
+		nr_reqs = nr_nodes;
+
+	ret = read_object(entries, nr_nodes, sys->epoch,
+			  oid, (char *)&inode, sizeof(inode), 0,
+			  nr_reqs);
+	if (ret < 0)
+		return SD_RES_EIO;
+
+	memset(inode.name, 0, sizeof(inode.name));
+
+	ret = write_object(entries, nr_nodes, sys->epoch,
+			  oid, (char *)&inode, sizeof(inode), 0,
+			   nr_reqs, 0);
+	if (ret < 0)
+		return SD_RES_EIO;
+
+	ret = start_deletion(oid);
+	if (ret < 0)
+		return SD_RES_NO_MEM;
+
+	return SD_RES_SUCCESS;
 }
 
 int read_vdis(char *data, int len, unsigned int *rsp_len)
@@ -287,3 +327,223 @@ int read_vdis(char *data, int len, unsigned int *rsp_len)
 
 	return SD_RES_SUCCESS;
 }
+
+struct deletion_work {
+	uint32_t done;
+
+	struct work work;
+	struct list_head dw_siblings;
+
+	uint64_t oid;
+
+	int count;
+	char *buf;
+};
+
+static LIST_HEAD(deletion_work_list);
+static int deleting;
+
+static void delete_one(struct work *work, int idx)
+{
+	struct deletion_work *dw = container_of(work, struct deletion_work, work);
+	uint64_t vdi_oid = *(((uint64_t *)dw->buf) + dw->count - dw->done - 1);
+	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	int nr_nodes;
+	int ret, i;
+	static struct sheepdog_inode inode;
+
+	eprintf("%d %d, %16lx\n", dw->done, dw->count, vdi_oid);
+
+	nr_nodes = build_node_list(&sys->sd_node_list, entries);
+
+	ret = read_object(entries, nr_nodes, sys->epoch,
+			  vdi_oid, (void *)&inode, sizeof(inode), 0, sys->nr_sobjs);
+
+	if (ret != sizeof(inode)) {
+		eprintf("cannot find vdi object\n");
+		return;
+	}
+
+	for (i = 0; i < MAX_DATA_OBJS; i++) {
+		if (!inode.data_oid[i])
+			continue;
+
+		remove_object(entries, nr_nodes, sys->epoch,
+			      inode.data_oid[i], inode.nr_copies);
+	}
+
+	if (remove_object(entries, nr_nodes, sys->epoch, vdi_oid, sys->nr_sobjs))
+		eprintf("failed to remove vdi objects %lx\n", vdi_oid);
+}
+
+static void __start_deletion(struct work *work, int idx);
+static void __start_deletion_done(struct work *work, int idx);
+
+static void delete_one_done(struct work *work, int idx)
+{
+	struct deletion_work *dw = container_of(work, struct deletion_work, work);
+
+	dw->done++;
+	if (dw->done < dw->count) {
+		queue_work(dobj_queue, &dw->work);
+		return;
+	}
+
+	deleting--;
+
+	list_del(&dw->dw_siblings);
+
+	free(dw->buf);
+	free(dw);
+
+	if (!list_empty(&deletion_work_list)) {
+		dw = list_first_entry(&deletion_work_list,
+				      struct deletion_work, dw_siblings);
+
+		deleting++;
+		queue_work(dobj_queue, &dw->work);
+	}
+}
+
+static int fill_vdi_list(struct deletion_work *dw,
+			 struct sheepdog_node_list_entry *entries,
+			 int nr_entries, uint64_t root_oid)
+{
+	int ret, i;
+	static struct sheepdog_inode inode;
+	int done = dw->count;
+	uint64_t oid;
+
+	((uint64_t *)dw->buf)[dw->count++] = root_oid;
+again:
+	oid = ((uint64_t *)dw->buf)[done++];
+	ret = read_object(entries, nr_entries, sys->epoch,
+			  oid, (void *)&inode, sizeof(inode), 0, nr_entries);
+
+	if (ret != sizeof(inode)) {
+		eprintf("cannot find vdi object\n");
+		return 0;
+	}
+
+	if (inode.name[0] != '\0')
+		return 1;
+
+	for (i = 0; i < ARRAY_SIZE(inode.child_oid); i++) {
+		if (!inode.child_oid[i])
+			continue;
+
+		((uint64_t *)dw->buf)[dw->count++] = inode.child_oid[i];
+	}
+
+	if (((uint64_t *)dw->buf)[done])
+		goto again;
+
+	return 0;
+}
+
+static uint64_t get_vdi_root(struct sheepdog_node_list_entry *entries,
+			     int nr_entries, uint64_t oid)
+{
+	int ret;
+	static struct sheepdog_inode inode;
+
+next:
+	ret = read_object(entries, nr_entries, sys->epoch, oid,
+			  (void *)&inode, sizeof(inode), 0, nr_entries);
+
+	if (ret != sizeof(inode)) {
+		eprintf("cannot find vdi object\n");
+		return 0;
+	}
+
+	if (!inode.parent_oid)
+		return oid;
+
+	oid = inode.parent_oid;
+
+	goto next;
+}
+
+static void __start_deletion(struct work *work, int idx)
+{
+	struct deletion_work *dw = container_of(work, struct deletion_work, work);
+	struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+	int nr_nodes, ret;
+	uint64_t root_oid;
+
+	nr_nodes = build_node_list(&sys->sd_node_list, entries);
+
+	root_oid = get_vdi_root(entries, nr_nodes, dw->oid);
+	if (!root_oid)
+		goto fail;
+
+	ret = fill_vdi_list(dw, entries, nr_nodes, root_oid);
+	if (ret)
+		goto fail;
+
+	return;
+
+fail:
+	dw->count = 0;
+	return;
+}
+
+static void __start_deletion_done(struct work *work, int idx)
+{
+	struct deletion_work *dw = container_of(work, struct deletion_work, work);
+
+	dprintf("%d\n", dw->count);
+
+	if (dw->count) {
+		dw->work.fn = delete_one;
+		dw->work.done = delete_one_done;
+
+		queue_work(dobj_queue, &dw->work);
+		return;
+	}
+
+	deleting--;
+
+	list_del(&dw->dw_siblings);
+
+	free(dw->buf);
+	free(dw);
+
+	if (!list_empty(&deletion_work_list)) {
+		dw = list_first_entry(&deletion_work_list,
+				      struct deletion_work, dw_siblings);
+
+		deleting++;
+		queue_work(dobj_queue, &dw->work);
+	}
+}
+
+int start_deletion(uint64_t oid)
+{
+	struct deletion_work *dw;
+
+	dw = zalloc(sizeof(struct deletion_work));
+	if (!dw)
+		return -1;
+
+	dw->buf = zalloc(1 << 20); /* FIXME: handle larger buffer */
+	if (!dw->buf) {
+		free(dw);
+		return -1;
+	}
+
+	dw->count = 0;
+	dw->oid = oid;
+
+	dw->work.fn = __start_deletion;
+	dw->work.done = __start_deletion_done;
+
+	list_add_tail(&dw->dw_siblings, &deletion_work_list);
+
+	if (!deleting) {
+		deleting++;
+		queue_work(dobj_queue, &dw->work);
+	}
+
+	return 0;
+}
diff --git a/include/net.h b/include/net.h
index f456eb0..618d202 100644
--- a/include/net.h
+++ b/include/net.h
@@ -43,6 +43,9 @@ int read_object(struct sheepdog_node_list_entry *e,
 		int nodes, uint32_t node_version,
 		uint64_t oid, char *data, unsigned int datalen,
 		uint64_t offset, int nr);
+int remove_object(struct sheepdog_node_list_entry *e,
+		  int nodes, uint32_t node_version,
+		  uint64_t oid, int nr);
 
 int exec_reqs(struct sheepdog_node_list_entry *e,
 	      int nodes, uint32_t node_version, uint64_t oid, struct sd_req *hdr,
diff --git a/lib/net.c b/lib/net.c
index ff261e5..40d3d00 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -439,6 +439,52 @@ int read_object(struct sheepdog_node_list_entry *e,
 	return -1;
 }
 
+int remove_object(struct sheepdog_node_list_entry *e,
+		  int nodes, uint32_t node_version,
+		  uint64_t oid, int nr)
+{
+	char name[128];
+	struct sd_obj_req hdr;
+	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
+	int i = 0, n, fd, ret;
+
+	if (nr > nodes)
+		nr = nodes;
+
+	for (i = 0; i < nr; i++) {
+		unsigned wlen = 0, rlen = 0;
+
+		n = obj_to_sheep(e, nodes, oid, i);
+
+		addr_to_str(name, sizeof(name), e[n].addr, 0);
+
+		fd = connect_to(name, e[n].port);
+		if (fd < 0) {
+			rsp->result = SD_RES_EIO;
+			return -1;
+		}
+
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.epoch = node_version;
+		hdr.opcode = SD_OP_REMOVE_OBJ;
+		hdr.oid = oid;
+
+		hdr.flags = 0;
+		hdr.data_length = rlen;
+
+		ret = exec_req(fd, (struct sd_req *)&hdr, NULL, &wlen, &rlen);
+		close(fd);
+
+		if (ret)
+			return -1;
+	}
+
+	if (rsp->result != SD_RES_SUCCESS)
+		return -1;
+
+	return 0;
+}
+
 /* TODO: clean up with the above functions */
 int exec_reqs(struct sheepdog_node_list_entry *e,
 	      int nodes, uint32_t node_version, uint64_t oid, struct sd_req *hdr,
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index 087c397..471c0e1 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -79,6 +79,7 @@ static void usage(int status)
 \n\
 Command syntax:\n\
   mkfs [--copies=N]\n\
+  delete [-i snapshot_id] vdiname\n\
   info -t (vdi|dog|sheep|obj|cluster) [-f (list|tree|graph)] [-H (on|off)] [-R (on|off)] [-i N] [-e N] [vdiname]\n\
   debug -o node_version\n\
   shutdown\n\
@@ -262,6 +263,56 @@ static int mkfs(int copies)
 	return 0;
 }
 
+static int delete(char *data, uint32_t id)
+{
+	int fd, ret;
+	struct sd_vdi_req hdr;
+	struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr;
+	unsigned rlen, wlen;
+	char vdiname[SD_MAX_VDI_LEN];
+
+	fd = connect_to("localhost", sdport);
+	if (fd < 0)
+		return -1;
+
+	memset(&hdr, 0, sizeof(hdr));
+
+	rlen = 0;
+	wlen = sizeof(vdiname);
+
+	hdr.opcode = SD_OP_DEL_VDI;
+	if (id != ~0)
+		hdr.snapid = id;
+	hdr.epoch = node_list_version;
+	hdr.flags = SD_FLAG_CMD_WRITE;
+	hdr.data_length = wlen;
+	strncpy(vdiname, data, sizeof(vdiname));
+
+	ret = exec_req(fd, (struct sd_req *)&hdr, vdiname, &wlen, &rlen);
+	close(fd);
+
+	if (ret != SD_RES_SUCCESS) {
+		fprintf(stderr, "failed to connect the dog\n");
+		return ret;
+	}
+
+	if (rsp->result != SD_RES_SUCCESS) {
+		switch (rsp->result) {
+		case SD_RES_VDI_LOCKED:
+			fprintf(stderr, "the vdi is locked\n");
+			break;
+		case SD_RES_NO_VDI:
+			fprintf(stderr, "no such vdi\n");
+			break;
+		default:
+			fprintf(stderr, "error, %d\n", rsp->result);
+			break;
+		}
+	}
+
+	return 0;
+}
+
 static int debug(char *op, char *arg)
 {
 	struct sd_req hdr;
@@ -458,9 +509,11 @@ int parse_vdi(vdi_parser_func_t func, void *data)
 		ret = read_object(node_list_entries, nr_nodes, node_list_version,
 				  bit_to_oid(nr), (void *)&i, sizeof(i), 0, nr_nodes);
 
-		if (ret == sizeof(i))
+		if (ret == sizeof(i)) {
+			if (i.name[0] == '\0') /* deleted */
+				continue;
 			func(i.oid, i.name, i.snap_id, 0, &i, data);
-		else
+		} else
 			printf("error %lu %" PRIx64 ", %d\n", nr, bit_to_oid(nr), ret);
 
 	}
@@ -1232,6 +1285,8 @@ int main(int argc, char **argv)
 		info(type, format, name, highlight, real_time, index);
 	} else if (!strcasecmp(command, "mkfs"))
 		ret = mkfs(copies);
+	else if (!strcasecmp(command, "delete"))
+		ret = delete(argv[optind], index);
 	else if (!strcasecmp(command, "debug"))
 		ret = debug(op, argv[optind]);
 	else if (!strcasecmp(command, "shutdown"))
-- 
1.5.6.5




More information about the sheepdog mailing list