[Sheepdog] [PATCH 0/2] the super object removal

Fri Mar 26 10:11:55 CET 2010

On Fri, 26 Mar 2010 10:17:19 +0900
FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp> wrote:

> We use the super object to manage the list of the existing VDIs.
> 
> Unlike data objects, the super object is a directory and the directory
> is replicated on multiple nodes. I concluded that the super object
> recovery code is too tricky and complicated.
> 
> So this patchset removes the super object. We manage the list of the
> existing VDIs like p2p applications. At startup, the nodes build the
> list of VDIs.
> 
> TODO: restart support

Done. The patch below adds restart support; it can be applied on top of the patchset.

=
From: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
Subject: [PATCH] add reboot support without the super object

Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
 collie/group.c      |   64 +++++++++++++++++++++++++++++++++++++++++++++++----
 collie/net.c        |    6 ++++-
 collie/store.c      |   52 +++++++++++++++++++++++++++++++++++-----
 include/meta.h      |    5 ++++
 lib/net.c           |    5 +++-
 shepherd/shepherd.c |    2 +
 6 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/collie/group.c b/collie/group.c
index 836de83..2067870 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -199,9 +199,6 @@ void cluster_queue_request(struct work *work, int idx)
 
 		rsp->result = SD_RES_SUCCESS;
 		break;
-	case SD_OP_READ_VDIS:
-		rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
-		break;
 	default:
 		/* forward request to group */
 		goto forward;
@@ -431,6 +428,58 @@ static void join(struct join_message *msg)
 		msg->cluster_status = sys->status;
 }
 
+/*
+ * Ask every other node for its vdi bitmap (SD_OP_READ_VDIS) and OR the
+ * replies into sys->vdi_inuse, so the node order does not matter.  A node
+ * that cannot be reached or whose request fails is logged and skipped.
+ */
+static void get_vdi_bitmap_from_all(void)
+{
+	struct sd_req hdr;
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;	/* aliases hdr */
+	int i, j, ret, nr_nodes, fd;
+	/* fixme: static scratch buffer, only needed during startup. */
+	static DECLARE_BITMAP(tmp_vdi_inuse, SD_NR_VDIS);
+	struct sheepdog_node_list_entry entry[SD_MAX_NODES];
+	unsigned int rlen, wlen;
+	char host[128];
+
+	nr_nodes = build_node_list(&sys->sd_node_list, entry);
+
+	for (i = 0; i < nr_nodes; i++) {
+		if (!memcmp(&sys->this_node, &entry[i], sizeof(sys->this_node)))
+			continue;	/* this node: nothing to fetch */
+
+		addr_to_str(host, sizeof(host), entry[i].addr, 0);
+
+		fd = connect_to(host, entry[i].port);
+		if (fd < 0) {
+			vprintf(SDOG_ERR "can't get the vdi bitmap %s, %m\n", host);
+			continue;	/* no socket to send on or to close */
+		}
+
+		vprintf(SDOG_ERR "get the vdi bitmap %d %s\n", i, host);
+
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.opcode = SD_OP_READ_VDIS;
+		hdr.epoch = sys->epoch;
+		hdr.data_length = sizeof(tmp_vdi_inuse);
+		rlen = hdr.data_length;
+		wlen = 0;
+
+		ret = exec_req(fd, &hdr, (char *)tmp_vdi_inuse, &wlen, &rlen);
+		close(fd);
+		if (ret || rsp->result != SD_RES_SUCCESS) {
+			vprintf(SDOG_ERR "can't get the vdi bitmap %d %d\n", ret,
+				rsp->result);
+			continue;	/* don't merge an unverified buffer */
+		}
+
+		for (j = 0; j < ARRAY_SIZE(sys->vdi_inuse); j++)
+			sys->vdi_inuse[j] |= tmp_vdi_inuse[j];
+	}
+}
+
 static void update_cluster_info(struct join_message *msg)
 {
 	int i;
@@ -498,9 +547,14 @@ out:
 	if (sys->status == SD_STATUS_STARTUP && msg->cluster_status == SD_STATUS_OK)
 		sys->epoch = get_latest_epoch();
 
-	if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS)
-		sys->status = msg->cluster_status;
+	if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS) {
+		if (msg->cluster_status == SD_STATUS_OK) {
+			get_vdi_bitmap_from_all();
+			set_global_nr_copies(sys->nr_sobjs);
+		}
 
+		sys->status = msg->cluster_status;
+	}
 	return;
 }
 
diff --git a/collie/net.c b/collie/net.c
index 137790c..09b2452 100644
--- a/collie/net.c
+++ b/collie/net.c
@@ -54,6 +54,7 @@ static void queue_request(struct request *req)
 		case SD_OP_MAKE_FS:
 		case SD_OP_GET_NODE_LIST:
 		case SD_OP_READ_EPOCH:
+		case SD_OP_READ_VDIS:
 			break;
 		default:
 			if (sys->status == SD_STATUS_STARTUP)
@@ -88,9 +89,12 @@ static void queue_request(struct request *req)
 	case SD_OP_MAKE_FS:
 	case SD_OP_SHUTDOWN:
 	case SD_OP_STAT_CLUSTER:
-	case SD_OP_READ_VDIS:
 		req->work.fn = cluster_queue_request;
 		break;
+	case SD_OP_READ_VDIS:
+		rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
+		req->done(req);
+		return;
 	default:
 		eprintf("unknown operation %d\n", hdr->opcode);
 		rsp->result = SD_RES_SYSTEM_ERROR;
diff --git a/collie/store.c b/collie/store.c
index 429124c..5c870b6 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -439,9 +439,6 @@ static int store_queue_request_local(struct request *req, char *buf, uint32_t ep
 			goto out;
 		}
 
-		if (!is_data_obj(oid))
-			break;
-
 		if (hdr->flags & SD_FLAG_CMD_COW) {
 			dprintf("%" PRIu64 "\n", hdr->cow_oid);
 
@@ -567,7 +564,7 @@ void store_queue_request(struct work *work, int idx)
 	ret = store_queue_request_local(req, buf, epoch);
 out:
 	if (ret != SD_RES_SUCCESS) {
-		dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %x\n",
+		dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %d\n",
 			idx, opcode, oid, epoch, req_epoch, ret);
 		rsp->result = ret;
 	}
@@ -1075,6 +1072,8 @@ static int init_path(char *d, int *new)
 {
 	int ret, retry = 0;
 	struct stat s;
+
+	*new = 0;
 again:
 	ret = stat(d, &s);
 	if (ret) {
@@ -1123,12 +1122,50 @@ static int init_obj_path(char *base_path)
 
 static int init_epoch_path(char *base_path)
 {
-	int new;
+	int new, ret;
+	uint32_t epoch;
+	DIR *dir;
+	char path[1024];
+	struct dirent *dent;
+	uint64_t oid;
 
 	epoch_path = zalloc(strlen(base_path) + strlen(EPOCH_PATH) + 1);
 	sprintf(epoch_path, "%s" EPOCH_PATH, base_path);
 
-	return init_path(epoch_path, &new);
+	ret = init_path(epoch_path, &new);
+	if (new || ret)
+		return ret;
+	/* ret == 0 and new == 0 here; presumably the dir already existed */
+	epoch = get_latest_epoch();
+	/* NOTE(review): path is built from obj_path, not epoch_path - confirm */
+	snprintf(path, sizeof(path), "%s/%08u", obj_path, epoch);
+
+	vprintf(SDOG_INFO "found the epoch dir, %s\n", path);
+
+	dir = opendir(path);
+	if (!dir) {
+		vprintf(SDOG_ERR "failed to open the epoch dir, %m\n");
+		return SD_RES_EIO;
+	}
+	/* mark every non-data object found in that dir in sys->vdi_inuse */
+	while ((dent = readdir(dir))) {
+		if (!strcmp(dent->d_name, ".") ||
+		    !strcmp(dent->d_name, ".."))
+			continue;
+		/* the entry name is parsed as a hexadecimal object id */
+		oid = strtoull(dent->d_name, NULL, 16);
+
+		if (is_data_obj(oid))
+			continue;
+
+		vprintf(SDOG_DEBUG "found the vdi obj, %" PRIx64 " %lu\n",
+			oid, oid_to_bit(oid));
+
+		set_bit(oid_to_bit(oid), sys->vdi_inuse);
+	}
+	closedir(dir);
+
+	return 0;
 }
 
 static int init_mnt_path(char *base_path)
@@ -1254,6 +1291,7 @@ static int global_nr_copies(uint32_t *copies, int set)
 		}
 	} else {
 		if (ret != sizeof(*copies)) {
+			eprintf("use 'user_xattr' option?\n");
 			return SD_RES_SYSTEM_ERROR;
 		}
 	}
@@ -1268,5 +1306,5 @@ int set_global_nr_copies(uint32_t copies)
 
 int get_global_nr_copies(uint32_t *copies)
 {
-	return global_nr_copies(copies, 1);
+	return global_nr_copies(copies, 0);	/* 2nd arg is 'set': 0 in the getter */
 }
diff --git a/include/meta.h b/include/meta.h
index 99fc38a..338f660 100644
--- a/include/meta.h
+++ b/include/meta.h
@@ -70,4 +70,9 @@ static inline uint64_t bit_to_oid(unsigned long nr)
 	return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT;
 }
 
+static inline unsigned long oid_to_bit(uint64_t oid)
+{
+	return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; /* undoes bit_to_oid() */
+}
+
 #endif
diff --git a/lib/net.c b/lib/net.c
index c85ee2d..ff261e5 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -412,8 +412,11 @@ int read_object(struct sheepdog_node_list_entry *e,
 		addr_to_str(name, sizeof(name), e[n].addr, 0);
 
 		fd = connect_to(name, e[n].port);
-		if (fd < 0)
+		if (fd < 0) {
+			printf("%s(%d): %s, %m\n", __func__, __LINE__,
+			       name);
 			return -1;
+		}
 
 		memset(&hdr, 0, sizeof(hdr));
 		hdr.epoch = node_version;
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index 0d7cecb..55a4fe0 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -449,6 +449,8 @@ int parse_vdi(vdi_parser_func_t func, void *data)
 
 		if (ret == sizeof(i))
 			func(i.oid, i.name, i.snap_id, 0, &i, data);
+		else
+			printf("error %lu %" PRIx64 ", %d\n", nr, bit_to_oid(nr), ret);
 
 	}
 
-- 
1.7.0